def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Fit and score one additive spline model per target column.

    For every column of ``Y`` a ``LinearGAM`` with one spline term per column
    of ``X`` is tuned with ``gridsearch`` inside a ``BalancedKFold`` split to
    collect out-of-fold predictions, and is then tuned once more on all rows
    to obtain in-sample scores.

    Args:
        X: predictor DataFrame (``.iloc``, ``.columns`` and ``.shape`` are used).
        Y: DataFrame of targets, processed column by column.
        get_importance: when true, ``get_importances`` is called on every fold
            and its per-predictor values are collected.
        n_splines: number of splines handed to every ``s`` term.
        folds: value passed to ``BalancedKFold``.

    Returns:
        dict mapping each target name to a dict with the keys ``'scores_cv'``,
        ``'scores_insample'``, ``'pred_vars'``, ``'importances'`` and
        ``'model'`` (the model after the final in-sample search).
    """
    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)

    # Preliminary fit on the full predictor matrix; this model is replaced
    # inside the loops below.
    # NOTE(review): presumably this primes the shared term objects with the
    # full data range before the fold-wise fits -- confirm before removing.
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])

    # run full model
    GAM_results = {}
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)
            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p
            if get_importance:
                # importances: predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        r_cv = np.corrcoef(y, pred)[0, 1]
        cv_scores = [{'r': r_cv,
                      'R2': r_cv**2,
                      'MAE': mean_absolute_error(y, pred)}]

        # insample: the search continues from the model of the last fold
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        r_in = np.corrcoef(y, in_pred)[0, 1]
        in_scores = [{'r': r_in,
                      'R2': r_in**2,
                      'MAE': mean_absolute_error(y, in_pred)}]

        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
def BAM(X, y):
    """Build and tune a two-feature additive model with pyGAM.

    The model has one cubic spline term for each of the first two feature
    columns plus a tensor-product term over both of them, and its
    hyper-parameters are chosen by ``gridsearch`` on the supplied data.

    Args:
        X: feature matrix; columns 0 and 1 are referenced by the terms.
        y: response values.

    Returns:
        The ``LinearGAM`` instance after the grid search.
    """
    # model implementation by PYGAM
    main_effects = s(0, spline_order=3) + s(1, spline_order=3)
    model = LinearGAM(main_effects + te(0, 1))
    model.gridsearch(X, y)
    return model
def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Cross-validate and fit a spline GAM for every column of ``Y``.

    Each target gets out-of-fold predictions from ``LinearGAM`` models tuned
    with ``gridsearch`` on the ``BalancedKFold`` training folds, followed by
    one more ``gridsearch`` on all rows for in-sample scores.

    Args:
        X: predictor DataFrame (``.iloc``, ``.columns`` and ``.shape`` are used).
        Y: DataFrame of targets, processed column by column.
        get_importance: when true, collect the output of ``get_importances``
            for every fold.
        n_splines: number of splines handed to every ``s`` term.
        folds: value passed to ``BalancedKFold``.

    Returns:
        dict keyed by target name; each value is a dict with ``'scores_cv'``,
        ``'scores_insample'``, ``'pred_vars'``, ``'importances'`` and
        ``'model'``.
    """
    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)

    # Preliminary fit on all rows; the resulting model is overwritten below.
    # NOTE(review): presumably kept so the shared term objects see the full
    # data range before the fold-wise fits -- confirm before removing.
    gam = LinearGAM(formula)
    gam.fit(X, X.iloc[:, 0])

    # run full model
    GAM_results = {}
    # iteritems() no longer exists in pandas >= 2.0; items() is equivalent.
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)
            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p
            if get_importance:
                # importances: predictive ability of each variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)

        r_cv = np.corrcoef(y, pred)[0, 1]
        cv_scores = [{
            'r': r_cv,
            'R2': r_cv**2,
            'MAE': mean_absolute_error(y, pred)
        }]

        # insample: the search continues from the model of the last fold
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        r_in = np.corrcoef(y, in_pred)[0, 1]
        in_scores = [{
            'r': r_in,
            'R2': r_in**2,
            'MAE': mean_absolute_error(y, in_pred)
        }]

        GAM_results[name] = {
            'scores_cv': cv_scores,
            'scores_insample': in_scores,
            'pred_vars': X.columns,
            'importances': importances,
            'model': gam
        }
    return GAM_results
def pspline(time, flux, edge_cutoff, max_splines, return_nsplines, verbose):
    """Estimate a spline trend with iterative outlier clipping (pyGAM).

    The flux is repeatedly fitted with a single-term ``LinearGAM`` tuned by
    ``gridsearch``; points whose detrended value deviates from 1 by more than
    ``constants.PSPLINES_STDEV_CUT`` standard deviations are removed before
    the next pass. At most ``constants.PSPLINES_MAXITER`` passes are made.
    A last fit on the surviving points is evaluated on the full ``time`` grid.

    Args:
        time: array of time stamps.
        flux: array of flux values, same length as ``time``.
        edge_cutoff: when > 0, trend values closer than this to either end of
            ``time`` are set to NaN.
        max_splines: ``n_splines`` of the spline term.
        return_nsplines: not read by this function; both values are always
            returned.
        verbose: print per-iteration progress and the convergence notice.

    Returns:
        tuple ``(trend, nsplines)`` where ``trend`` is evaluated at ``time``
        and ``nsplines`` is the rounded-up effective degrees of freedom of the
        final model.

    Raises:
        ImportError: if pygam cannot be imported.
    """
    try:
        from pygam import LinearGAM, s
    except ImportError:
        raise ImportError('Could not import pygam')

    cut = constants.PSPLINES_STDEV_CUT
    newflux = flux.copy()
    newtime = time.copy()
    detrended_flux = flux.copy() / np.nanmedian(newflux)

    for i in range(constants.PSPLINES_MAXITER):
        # nanstd equals std for NaN-free data; with NaNs present std would be
        # NaN and the comparison would reject every point.
        mask_outliers = np.ma.where(
            np.abs(1 - detrended_flux) < cut * np.nanstd(detrended_flux))
        newtime, newflux = cleaned_array(newtime[mask_outliers],
                                         newflux[mask_outliers])
        gam = LinearGAM(s(0, n_splines=max_splines))
        search_gam = gam.gridsearch(newtime[:, np.newaxis], newflux,
                                    progress=False)
        trend = search_gam.predict(newtime)
        detrended_flux = newflux / trend
        stdev = np.nanstd(detrended_flux)
        mask_outliers = np.ma.where(
            np.abs(1 - detrended_flux) > cut * stdev)
        if verbose:
            print('Iteration:', i + 1,
                  'Rejected outliers:', len(mask_outliers[0]))
        # Check convergence
        if len(mask_outliers[0]) == 0:
            if verbose:
                print('Converged.')
            break

    # Final iteration, applied to unclipped time series
    # (interpolated over clipped values)
    mask_outliers = np.ma.where(np.abs(1 - detrended_flux) < cut * stdev)
    newtime, newflux = cleaned_array(newtime[mask_outliers],
                                     newflux[mask_outliers])
    gam = LinearGAM(s(0, n_splines=max_splines))
    search_gam = gam.gridsearch(newtime[:, np.newaxis], newflux,
                                progress=False)
    trend = search_gam.predict(time)

    # Cut off edges
    if edge_cutoff > 0:
        low_index = np.argmax(time > (min(time) + edge_cutoff))
        hi_index = np.argmax(time > (max(time) - edge_cutoff))
        trend[:low_index] = np.nan
        trend[hi_index:] = np.nan

    nsplines = np.ceil(gam.statistics_['edof'])
    return trend, nsplines
def GAM_linear(X, y):
    """Tune a LinearGAM on three features and tabulate its residuals.

    The model uses spline terms for columns 0 and 1 and a factor term for
    column 2, and is tuned with ``gridsearch``.

    Args:
        X: pandas object holding the features (converted with ``to_numpy``).
        y: pandas object holding the response (converted with ``to_numpy``).

    Returns:
        tuple ``(gam, summary, frame)``: the tuned model, whatever
        ``gam.summary()`` returns, and a DataFrame whose column ``0`` is the
        in-sample prediction, ``'actual'`` the response and ``'residual'``
        their difference (actual minus predicted).
    """
    X = X.to_numpy()
    y = y.to_numpy()
    from pygam import LinearGAM, s, f, te

    model = LinearGAM(s(0) + s(1) + f(2))
    model.gridsearch(X, y)

    frame = pd.DataFrame(model.predict(X))
    frame['actual'] = y
    frame['residual'] = frame.actual - frame[0]
    return model, model.summary(), frame
def interp_gam(data):
    """Predict ``sample_z`` at stream distance (0, 0) with a tuned GAM.

    Rows whose first ``stream_dist`` column is not finite are discarded. The
    remaining rows are fitted with a ``LinearGAM`` (two spline terms and a
    tensor-product term over both ``stream_dist`` columns) tuned by
    ``gridsearch``.

    Args:
        data: object exposing ``stream_dist`` and ``sample_z`` with ``.values``
            arrays; ``stream_dist`` is indexed as a 2-D array.

    Returns:
        The model prediction at ``[[0, 0]]``, or ``np.nan`` when no row has a
        finite first stream distance.
    """
    valid = np.isfinite(data.stream_dist.values[:, 0])
    # Bail out before doing any indexing or fitting work on empty input.
    if not valid.any():
        return np.nan

    sample_st = data.stream_dist.values[valid]
    sample_z = data.sample_z.values[valid]

    terms = (s(0, n_splines=4) + s(1, n_splines=5)
             + te(0, 1, n_splines=4))
    gam = LinearGAM(terms).gridsearch(sample_st, sample_z)
    return gam.predict(np.array([[0, 0]]))[0]
def _fit_gam(self):
    """Fit the outcome GAM on the treatment and the GPS.

    The design matrix has the treatment values in column 0 and ``self.gps``
    in column 1; each column gets a spline term configured from
    ``self.n_splines`` and ``self.spline_order``.

    Returns:
        The object returned by ``LinearGAM.fit``.
    """
    design = np.column_stack((self.T.values, self.gps))
    outcome = np.asarray(self.y)

    treatment_term = s(0, n_splines=self.n_splines,
                       spline_order=self.spline_order)
    gps_term = s(1, n_splines=self.n_splines,
                 spline_order=self.spline_order)
    model = LinearGAM(treatment_term + gps_term,
                      max_iter=self.max_iter,
                      lam=self.lambda_)
    return model.fit(design, outcome)
def get_gam_model(self, features: [Field], model_type=TYPE_LINEAR):
    """Assemble an unfitted GAM with one term per feature.

    Features whose ``is_factor()`` is true get a factor term; all others get
    a spline term with ``self.num_splines`` splines. Term indices follow the
    position of the feature in ``features``.

    Args:
        features: non-empty sequence of field objects exposing ``is_factor()``.
        model_type: ``TYPE_LINEAR`` or ``TYPE_LOGISTIC``.

    Returns:
        A ``LinearGAM`` or ``LogisticGAM`` built from the terms, or ``None``
        when ``model_type`` matches neither constant.
    """
    def term_for(position):
        # categorical fields become factor terms, the rest spline terms
        if features[position].is_factor():
            return f(position)
        return s(position, n_splines=self.num_splines)

    model_spec = term_for(0)
    for position in range(1, len(features)):
        model_spec += term_for(position)

    if model_type == TYPE_LINEAR:
        return LinearGAM(model_spec)
    elif model_type == TYPE_LOGISTIC:
        return LogisticGAM(model_spec)
    return None
def _fit_gam(self):
    """Fit the outcome GAM (continuous or binary) on treatment and GPS.

    ``self.outcome_type`` selects the estimator: ``"continuous"`` maps to
    ``LinearGAM`` and ``"binary"`` to ``LogisticGAM``; any other value raises
    ``KeyError``. Column 0 of the design matrix is the treatment, column 1
    is ``self.gps``.

    Returns:
        The object returned by the estimator's ``fit``.
    """
    design = np.column_stack((self.T.values, self.gps))
    outcome = np.asarray(self.y)

    estimators = {"continuous": LinearGAM, "binary": LogisticGAM}
    estimator_cls = estimators[self.outcome_type]

    treatment_term = s(0, n_splines=self.n_splines,
                       spline_order=self.spline_order)
    gps_term = s(1, n_splines=self.n_splines,
                 spline_order=self.spline_order)
    model = estimator_cls(treatment_term + gps_term,
                          max_iter=self.max_iter,
                          lam=self.lambda_)
    return model.fit(design, outcome)
def GAM2(self):
    """Tune a logistic GAM with eight spline terms and optionally plot it.

    The model (no intercept) is tuned with ``gridsearch`` over eleven
    log-spaced ``lam`` values between 1e-3 and 1e3 on the training data, its
    summary is printed, and the test set is predicted. When ``self.plot`` is
    true the predictions (shifted down by 0.5) are drawn together with the
    test targets. Nothing is returned.
    """
    from pygam import LogisticGAM, s, l, f

    # one spline term for each of the feature columns 0..7
    terms = s(0)
    for column in range(1, 8):
        terms = terms + s(column)

    classifier = LogisticGAM(terms=terms, fit_intercept=False)
    # Generate the model
    mod = classifier.gridsearch(self.Xtrain.values, self.ytrain,
                                lam=np.logspace(-3, 3, 11))
    mod.summary()  # Pseudo-R2: 0.6449 (value noted by the original author)

    predicted = mod.predict(self.Xtest).reshape(-1, 1)
    # mean squared error on the test set; computed but not used further
    MSE1 = np.mean((self.ytest - predicted)**2).values

    if self.plot:
        plt.plot(range(len(predicted)), predicted - 0.5, "r.",
                 label='GAM model')
        plt.plot(range(len(self.ytest)), self.ytest, "b.",
                 label='Testing Data')
        plt.legend()
        plt.title("GAM model with linear terms. Prediction data is\n"
                  + "scaled downwards by 0.5 for visual purposes.")
        plt.ylabel("FFVC score")
        plt.xlabel("Sample no.")
        plt.show()
def smoother_expectileGAM(x, y, X, **kwargs):
    """Smooth ``y`` over ``x`` with an ExpectileGAM and evaluate it on ``X``.

    A single-spline ``ExpectileGAM`` is tuned with ``gridsearch`` on
    ``(x, y)`` and then evaluated on the grid ``X``.

    Args:
        x: 1-D sample positions (list or array).
        y: sample values (list or array).
        X: positions at which to evaluate the smoother (list or array);
            ``None`` evaluates on a copy of ``x``.
        **kwargs: ``expectile`` selects the fitted expectile (default 0.5).

    Returns:
        Array of predictions, one per entry of ``X``.
    """
    from pygam import s, ExpectileGAM

    if isinstance(x, list):
        x = np.array(x)
    if isinstance(y, list):
        y = np.array(y)
    if X is None:
        X = deepcopy(x)
    elif isinstance(X, list):
        # a plain list has no .reshape; accept it just like x and y
        X = np.array(X)

    # pyGAM is given 2-D, single-column feature matrices
    x = x.reshape(len(x), 1)
    X = X.reshape(len(X), 1)

    expectile = kwargs.get('expectile', .5)
    gam50 = ExpectileGAM(expectile=expectile, terms=s(0)).gridsearch(x, y)
    return gam50.predict(X)
def smoother_linearGAM(x, y, X, **kwargs):
    """Smooth ``y`` over ``x`` with a LinearGAM and evaluate it on ``X``.

    A ``LinearGAM`` with one ``'ps'``-basis spline term is tuned with
    ``gridsearch`` on ``(x, y)``.

    Args:
        x: 1-D sample positions (list or array).
        y: sample values (list or array).
        X: evaluation positions (list or array); ``None`` evaluates on ``x``.
        **kwargs: accepted but not read.

    Returns:
        Array of predicted means, one per evaluation position.
    """
    from pygam import LinearGAM, l, s

    def as_column(values):
        # pyGAM is given 2-D, single-column feature matrices
        return values.reshape(len(values), 1)

    if isinstance(x, list):
        x = np.array(x)
    x = as_column(x)
    if isinstance(y, list):
        y = np.array(y)
    if isinstance(X, list):
        X = np.array(X)
    X = as_column(x if X is None else X)

    smoother = LinearGAM(terms=s(0, basis='ps')).gridsearch(x, y)
    # sample on the input grid
    return smoother.predict(X)
def fit_gam_with_fix_dof(X, Y, dof):  ##{{{
    """Search for a smoothing penalty that yields a target effective dof.

    The model has a spline term on column 0 and an unpenalised linear term on
    column 1. ``lam`` of the spline term is bisected between 1e-2 and 1e2
    until the model's ``statistics_["edof"]`` is within 1e-2 of ``dof``.
    Every 100 iterations the bracket is reset and one more spline is allowed.
    There is no upper bound on the number of iterations.

    Args:
        X: feature matrix with at least two columns.
        Y: response values.
        dof: requested effective degrees of freedom.

    Returns:
        The last fitted ``LinearGAM``.
    """
    lower, upper = 1e-2, 1e2
    tol = 1e-2
    gap = 1. + tol  # guarantees that the loop body runs at least once
    n_splines = int(dof + 2)
    iterations = 0

    while gap > tol:
        lam = (upper + lower) / 2.
        terms = (pg.s(0, n_splines=n_splines, penalties="auto", lam=lam)
                 + pg.l(1, penalties=None))
        gam_model = pg.LinearGAM(terms)
        gam_model.fit(X, Y)

        edof = gam_model.statistics_["edof"]
        # too few degrees of freedom -> lower the upper bound on lam
        if edof < dof:
            upper = lam
        else:
            lower = lam
        gap = np.abs(dof - edof)

        iterations += 1
        if iterations % 100 == 0:
            # restart the bracket with a richer basis
            lower, upper = 1e-2, 1e2
            n_splines += 1

    return gam_model
def run_gam_effective_r_from_empirical(state_data, n_splines=25, algo=GammaGAM, n_bootstrap=100):
    """Bootstrap a GAM smoothing of the empirical reproduction ratio.

    The raw ratio is new cases divided by the previous row's total cases,
    scaled by ``1 / RECOVERY_RATE``. ``n_bootstrap`` models (spline plus
    linear term on the row position) are fitted with Dirichlet-distributed
    sample weights; their predictions are summarised per row.

    Args:
        state_data: DataFrame with ``'confirmed_new'`` and
            ``'confirmed_total'`` columns.
        n_splines: number of splines of the spline term.
        algo: GAM class to instantiate for every replicate.
        n_bootstrap: number of weighted replicates.

    Returns:
        DataFrame indexed like the ratio series with columns ``'ML'`` (mean),
        ``'Low_90'`` (5% quantile) and ``'High_90'`` (95% quantile); rows
        containing NaN are dropped.
    """
    # for numerical stability
    epsilon = 1

    ratio = state_data['confirmed_new'] / state_data['confirmed_total'].shift(1)
    R_series = ratio.dropna() * 1 / RECOVERY_RATE
    n_obs = R_series.shape[0]

    X = np.arange(n_obs)
    y = R_series.values + epsilon

    def fit_replicate():
        # one weighted fit with freshly drawn Dirichlet weights
        weights = dirichlet([1] * n_obs).rvs(1)
        model = algo(s(0, n_splines) + l(0))
        model.fit(X, y, weights=weights[0])
        return model

    # running GAM in bootstrap
    bootstrap = [fit_replicate() for _ in range(n_bootstrap)]
    preds = pd.DataFrame([model.predict(X) - epsilon
                          for model in bootstrap]).T

    estimate_rt = pd.DataFrame(index=R_series.index)
    estimate_rt['ML'] = preds.mean(axis=1).values
    estimate_rt['Low_90'] = preds.quantile(0.05, axis=1).values
    estimate_rt['High_90'] = preds.quantile(0.95, axis=1).values
    return estimate_rt.dropna()
def _build_ensemble_feature(self, X, base_pred):
    """Builds the feature array and the corresponding GAM TermList.

    Every base prediction becomes one column with a spline term
    (``self.nonlinear_ensemble`` true) or a linear term (otherwise). When
    ``self.model_residual`` is set, the columns of ``X`` are appended; they
    get one spline term each plus, if ``X`` has more than one column, a
    tensor-product term across all of them.

    Args:
        X: 2-D feature array (only read when ``self.model_residual`` is set).
        base_pred: dict of base-model predictions; its values are stacked as
            columns in iteration order.

    Returns:
        tuple ``(ens_feature, gam_feature_terms)``.
    """
    make_term = s if self.nonlinear_ensemble else l

    ens_feature = np.asarray(list(base_pred.values())).T
    n_base = ens_feature.shape[1]
    term_list = [make_term(column) for column in range(n_base)]

    # optionally, add residual process
    if self.model_residual:
        n_resid = X.shape[1]
        # build gam terms: columns of X follow the base predictions
        term_list.extend(
            [s(column) for column in range(n_base, n_base + n_resid)])
        if n_resid > 1:
            term_list.append(
                te(*list(n_base + np.array(range(n_resid)))))
        # update features
        ens_feature = np.concatenate([ens_feature, X], axis=1)

    return ens_feature, TermList(*term_list)
def spline_classification_plot(ax, X, y, X_eval, y_eval, gam_ref):
    """Fit a monotone logistic spline calibrator, plot it and score it.

    A ``LogisticGAM`` with one monotonically increasing spline term (five
    splines) is tuned with ``gridsearch`` on ``(X, y)``. Its probability
    curve and 95% confidence band over [0, 1] are drawn on ``ax``, and the
    calibration metrics on ``(X_eval, y_eval)`` are written into the axes.

    Args:
        ax: matplotlib-style axes to draw on.
        X: calibration inputs used for fitting.
        y: calibration labels used for fitting.
        X_eval: inputs used for the metrics.
        y_eval: labels used for the metrics.
        gam_ref: reference model handed to ``MseEval``.

    Returns:
        tuple ``(ece, mce, brier, acc, mse, ax, confi)`` where ``confi`` is
        the 95% confidence interval of the model at ``X_eval``.
    """
    # documentation of LogisticGAM:
    # https://pygam.readthedocs.io/en/latest/api/logisticgam.html
    gam = LogisticGAM(
        s(0, constraints='monotonic_inc', n_splines=5)).gridsearch(X, y)

    XX = np.linspace(0, 1, 100)
    ax.plot(XX, gam.predict_proba(XX), c='g')
    ax.plot(XX, gam.confidence_intervals(XX, width=0.95), c='r', ls='--')

    # compute ece and acc after calibration
    y_ = gam.predict_proba(X_eval)
    ece = EceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    mce = MceEval(np.array([1 - y_, y_]).T, y_eval, num_bins=100)
    brier = BrierEval(np.array([1 - y_, y_]).T, y_eval)
    mse = MseEval(gam, gam_ref, num_bins=100)
    acc = gam.accuracy(X_eval, y_eval)

    ax.text(0.05, 0.75,
            'ECE=%.4f\nMCE=%.4f\nBrier=%.4f\nACC=%.4f\nMSE=%.4f'
            % (ece, mce, brier, acc, mse),
            size=6, ha='left', va='center',
            bbox={'facecolor': 'green', 'alpha': 0.5, 'pad': 4})
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)

    confi = gam.confidence_intervals(X_eval, width=0.95)
    # Function-call form: valid on Python 3, same output on Python 2.
    print(gam.summary())
    return ece, mce, brier, acc, mse, ax, confi
def _fit_final_gam(self):
    """Regress the pseudo-outcome on the original treatment values.

    Uses a single cubic spline term with 30 splines, at most 500 iterations
    and ``self.bandwidth`` as the smoothing penalty.

    Returns:
        The object returned by ``LinearGAM.fit``.
    """
    treatment_term = s(0, n_splines=30, spline_order=3)
    final_model = LinearGAM(treatment_term, max_iter=500,
                            lam=self.bandwidth)
    return final_model.fit(self.t_data, y=self.pseudo_out)
def spline_calibration(X, y):
    """Calibrate scores with a monotonically increasing logistic spline.

    A ``LogisticGAM`` with one ``'monotonic_inc'`` spline term is tuned with
    ``gridsearch`` on ``(X, y)`` and evaluated on the same ``X``.

    documentation of LogisticGAM:
    https://pygam.readthedocs.io/en/latest/api/logisticgam.html

    Args:
        X: scores to calibrate.
        y: labels the scores are calibrated against.

    Returns:
        The calibrated probabilities for ``X``.
    """
    monotone_term = s(0, constraints='monotonic_inc')
    calibrator = LogisticGAM(monotone_term).gridsearch(X, y)
    return calibrator.predict_proba(X)
def pspline(time, flux):
    """Estimate a spline trend with iterative outlier clipping (pyGAM).

    The flux is repeatedly fitted with a single-term ``LinearGAM``
    (``constants.PSPLINES_MAX_SPLINES`` splines) tuned by ``gridsearch``.
    Before each pass, points with ``1 - detrended_flux`` at or above
    ``constants.PSPLINES_STDEV_CUT`` standard deviations are dropped. At most
    ``constants.PSPLINES_MAXITER`` passes are made; progress is printed.

    NOTE(review): the clipping test is one-sided (no absolute value) and the
    first pass compares the raw, un-normalised flux against 1 -- confirm that
    both are intended.

    Args:
        time: array of time stamps.
        flux: array of flux values, same length as ``time``.

    Returns:
        The trend of the final fit evaluated at ``time``.

    Raises:
        ImportError: if pygam cannot be imported.
    """
    try:
        from pygam import LinearGAM, s
    except ImportError:
        raise ImportError('Could not import pygam')

    newflux = flux.copy()
    newtime = time.copy()
    detrended_flux = flux.copy()

    for i in range(constants.PSPLINES_MAXITER):
        mask_outliers = numpy.ma.where(
            1 - detrended_flux <
            constants.PSPLINES_STDEV_CUT * numpy.std(detrended_flux))
        newtime, newflux = cleaned_array(newtime[mask_outliers],
                                         newflux[mask_outliers])
        gam = LinearGAM(s(0, n_splines=constants.PSPLINES_MAX_SPLINES))
        search_gam = gam.gridsearch(newtime[:, numpy.newaxis], newflux,
                                    progress=False)
        trend = search_gam.predict(newtime)
        detrended_flux = newflux / trend
        stdev = numpy.std(detrended_flux)
        mask_outliers = numpy.ma.where(
            1 - detrended_flux >
            constants.PSPLINES_STDEV_CUT * numpy.std(detrended_flux))
        print('Iteration:', i + 1,
              'Rejected outliers:', len(mask_outliers[0]))
        # Check convergence
        if len(mask_outliers[0]) == 0:
            print('Converged.')
            break

    # Final iteration, applied to unclipped time series
    # (interpolated over clipped values)
    mask_outliers = numpy.ma.where(
        1 - detrended_flux < constants.PSPLINES_STDEV_CUT * stdev)
    newtime, newflux = cleaned_array(newtime[mask_outliers],
                                     newflux[mask_outliers])
    gam = LinearGAM(s(0, n_splines=constants.PSPLINES_MAX_SPLINES))
    search_gam = gam.gridsearch(newtime[:, numpy.newaxis], newflux,
                                progress=False)
    trend = search_gam.predict(time)
    return trend
def smooth_gam(x, y, n_splines=100, lam=10):
    """Smooth ``y`` over ``x`` with a fixed-penalty LinearGAM.

    The model is fitted once (no grid search) and evaluated on the grid that
    pyGAM generates for its spline term.

    Args:
        x: feature values passed to ``LinearGAM.fit``.
        y: response values.
        n_splines: number of splines of the single spline term.
        lam: smoothing penalty passed to ``LinearGAM``.

    Returns:
        tuple ``(grid, mean, confi)``: first column of the generated grid,
        the predicted mean on it and the confidence intervals on it.
    """
    from pygam import ExpectileGAM, LinearGAM, s, f

    spline_term = s(0, n_splines=n_splines)
    smoother = LinearGAM(spline_term, lam=lam).fit(x, y)

    grid = smoother.generate_X_grid(term=0)
    band = smoother.confidence_intervals(grid)
    mean_curve = smoother.predict_mu(grid)
    return grid[:, 0], mean_curve, band
def BAM():
    """Tune, plot and score a basic additive model on the globals X and y.

    The model combines two concave cyclic-basis spline terms with a
    tensor-product term, uses a normal distribution with identity link, and
    is tuned with ``gridsearch`` over ``n_splines`` in ``range(50)``. The
    summary is printed, the first 56 samples are plotted against the
    prediction, and RMSE, MAE and MAPE over all samples are printed.
    """
    def concave_spline(feature):
        # each term gets its own edge_knots list
        return s(feature, n_splines=25, spline_order=3,
                 constraints='concave', penalties='auto', basis='cp',
                 edge_knots=[147, 147])

    terms = (concave_spline(0) + concave_spline(1)
             + te(0, 1, dtype=['numerical', 'numerical']))
    gam = GAM(terms, distribution='normal', link='identity',
              fit_intercept=True)
    print(gam.gridsearch(X, y, n_splines=np.arange(50)).summary())

    shown = slice(0, 56)  # only the first 56 samples are drawn
    plt.scatter(X[:, 0][shown], y[shown], s=3, linewidths=0.0001,
                label='data')
    plt.plot(X[:, 0][shown], gam.predict(X[shown]), color='red',
             linewidth=1, label='prediction')
    plt.legend()
    plt.title('Basic Additive Model')
    plt.show()

    # error calculation
    fitted = gam.predict(X)
    rmse_val = rmse(np.array(y), np.array(fitted))
    print("RMSE is: " + str(rmse_val))
    mae = mean_absolute_error(y, fitted)
    print("MAE is: " + str(mae))
    mape = mean_absolute_percentage_error(np.array(y), np.array(fitted))
    print("MAPE is: " + str(mape))
def cleaner_expectileGAM(x, y, **kwargs):
    """Return the positions of samples outside an expectile envelope.

    A median (0.5) ``ExpectileGAM`` is tuned with ``gridsearch``; its ``lam``
    is then reused for an upper and a lower expectile model, which are fitted
    directly. Samples above the upper or below the lower curve are reported.

    Copying the smoothing from the 0.5 model makes the models less likely to
    cross and much faster, see
    https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html

    Args:
        x: 1-D sample positions (list or array).
        y: sample values (list or array).
        **kwargs: ``expectile_ulim`` (default 0.95) and ``expectile_llim``
            (default 0.05) select the envelope expectiles.

    Returns:
        List of integer positions ``i`` with ``y[i]`` outside the envelope.
    """
    from pygam import s, ExpectileGAM

    if isinstance(x, list):
        x = np.array(x)
    if isinstance(y, list):
        y = np.array(y)
    X = x.reshape(len(x), 1)

    median_model = ExpectileGAM(expectile=.5, terms=s(0)).gridsearch(X, y)
    # copy the smoothing to the other models
    lam = median_model.lam

    # now fit a few more models
    upper_q = kwargs.get('expectile_ulim', .95)
    lower_q = kwargs.get('expectile_llim', .05)
    upper_model = ExpectileGAM(expectile=upper_q, lam=lam,
                               terms=s(0)).fit(X, y)
    lower_model = ExpectileGAM(expectile=lower_q, lam=lam,
                               terms=s(0)).fit(X, y)

    ulim = upper_model.predict(X)
    llim = lower_model.predict(X)

    idx = []
    for i in range(len(y)):
        if y[i] > ulim[i] or y[i] < llim[i]:
            idx.append(i)
    return idx
def get_importances(X, y, Xtest, ytest):
    """Score every predictor by the test-set R2 of a GAM using it alone.

    For each column of ``X`` a single-spline ``LinearGAM`` without intercept
    is fitted, tuned with ``gridsearch`` and used to predict the same column
    of ``Xtest``.

    Args:
        X: training predictors (DataFrame).
        y: training response.
        Xtest: test predictors, indexable by the column names of ``X``.
        ytest: test response.

    Returns:
        dict mapping each column name to the squared correlation between
        ``ytest`` and the single-predictor prediction.
    """
    importances = {}
    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        # The plain fit is kept ahead of the search on purpose.
        # NOTE(review): it presumably takes part in the search as the
        # starting candidate -- confirm before dropping it.
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        importances[predictor] = np.corrcoef(ytest, pred)[0, 1]**2
    return importances
def _fit_gams(self, temp_t, temp_m, temp_y):
    """Fit the mediator model and the outcome model.

    The mediator model regresses ``temp_m`` on ``temp_t`` with one spline
    term. The outcome model regresses ``temp_y`` on ``temp_t`` and ``temp_m``
    (concatenated column-wise) with one spline term each. Both use the
    spline and solver settings stored on ``self``.

    Returns:
        tuple ``(mediator_model, outcome_model)`` of fitted ``LinearGAM``
        instances.
    """
    def spline(column):
        # spline term configured from the instance settings
        return s(column, n_splines=self.n_splines,
                 spline_order=self.spline_order)

    mediator_model = LinearGAM(spline(0),
                               fit_intercept=True,
                               max_iter=self.max_iter,
                               lam=self.lambda_)
    mediator_model.fit(temp_t, temp_m)

    outcome_model = LinearGAM(spline(0) + spline(1),
                              fit_intercept=True,
                              max_iter=self.max_iter,
                              lam=self.lambda_)
    outcome_design = pd.concat([temp_t, temp_m], axis=1)
    outcome_model.fit(outcome_design, temp_y)

    return mediator_model, outcome_model
def get_importances(X, y, Xtest, ytest):
    """Compute per-predictor importances as single-variable test R2.

    Each column of ``X`` is modelled on its own with a single-spline
    ``LinearGAM`` (no intercept) that is fitted, tuned with ``gridsearch``
    and evaluated on the matching column of ``Xtest``.

    Args:
        X: training predictors (DataFrame).
        y: training response.
        Xtest: test predictors, indexable by the column names of ``X``.
        ytest: test response.

    Returns:
        dict mapping each column name to the squared correlation between
        ``ytest`` and the single-predictor prediction.
    """
    importances = {}
    # iteritems() no longer exists in pandas >= 2.0; items() is equivalent.
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        # The plain fit is kept ahead of the search on purpose.
        # NOTE(review): it presumably takes part in the search as the
        # starting candidate -- confirm before dropping it.
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1]**2
        importances[predictor] = R2
    return importances
def _fit(self, X, y, mylam=None, **kwargs):
    """Build ``self.model`` and fit it, optionally searching over ``lam``.

    When ``self.fit_binary_feat_as_factor_term`` is set, a formula is built
    from ``self.feature_names``: features with fewer than two distinct values
    are skipped, those with exactly two become factor terms and all others
    spline terms. Otherwise the model class is created with its default
    terms.

    With ``self.search`` false the model is fitted once. Otherwise a grid
    search over ``mylam`` is run; on a ``LinAlgError`` or pyGAM
    ``OptimizationError`` the search is retried without the first grid value.

    Args:
        X: feature matrix (a DataFrame is converted to its values).
        y: response values.
        mylam: sequence of ``lam`` candidates; defaults to
            ``self.search_lam``.
        **kwargs: forwarded to ``fit`` / ``gridsearch``.

    Returns:
        The result of ``self.model.fit`` when ``self.search`` is false,
        otherwise ``self``.

    Raises:
        Exception: when no candidate in the grid leads to a trained model.
        IndexError: when every feature has fewer than two distinct values.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values

    if not self.fit_binary_feat_as_factor_term:
        self.model = self.model_cls(max_iter=self.max_iter,
                                    n_splines=self.n_splines,
                                    **self.kwargs)
    else:
        formulas = []
        for idx, feat_name in enumerate(self.feature_names):
            num_unique_x = len(self.X_values_counts[feat_name])
            if num_unique_x < 2:
                # constant feature: no term
                continue
            if num_unique_x == 2:
                formulas.append(f(idx))
            else:
                formulas.append(s(idx))

        the_formula = formulas[0]
        for term in formulas[1:]:
            the_formula += term
        self.model = self.model_cls(the_formula,
                                    max_iter=self.max_iter,
                                    n_splines=self.n_splines,
                                    **self.kwargs)

    if not self.search:
        # Just fit the model with this lam
        return self.model.fit(X, y, **kwargs)

    if mylam is None:
        mylam = self.search_lam

    # Each retry drops the first candidate; once none are left the search
    # cannot succeed, so fail with the training error instead of an
    # IndexError from the range message below.
    if len(mylam) == 0:
        raise Exception('Training fails.')

    # do a grid search over here
    try:
        print('search range from %f to %f' % (mylam[0], mylam[-1]))
        self.model.gridsearch(X, y, lam=mylam, **kwargs)
    except (np.linalg.LinAlgError, pygam.utils.OptimizationError) as e:
        print('Get the following error:', str(e),
              '\nRetry the grid search')
        if hasattr(self.model, 'coef_'):
            del self.model.coef_
        self._fit(X, y, mylam=mylam[1:], **kwargs)

    if not hasattr(self.model, 'statistics_'):
        # Does not finish the training
        raise Exception('Training fails.')
    return self
def _predict_gam(ds, conf, time, quantiles=None, size=None, return_gam=False, return_counts=False, max_time_diff=200):
    """Fit a GAM of the climate ensemble over age and predict it at ``time``.

    The variables ``conf.age + '_ensemble'`` and
    ``conf.climate + '_ensemble'`` of ``ds`` are flattened, pairs with a NaN
    are dropped, and a single-spline ``LinearGAM`` is tuned with
    ``gridsearch``. Predictions at times with fewer than 100 ensemble ages
    within ``max_time_diff`` are set to NaN in every returned array.

    Args:
        ds: dataset indexable by variable name, items exposing ``.values``.
        conf: object with ``climate`` and ``age`` name attributes.
        time: times at which to predict.
        quantiles: when given, prediction intervals for these quantiles are
            appended to the result.
        size: when given, this many posterior draws at ``time`` are appended.
        return_gam: append the fitted model to the result.
        return_counts: when truthy, append the number of ensemble ages within
            a radius of ``time``; the value itself is used as that radius.
        max_time_diff: radius used for the sparse-data masking.

    Returns:
        ``None`` when no complete (age, climate) pair exists; otherwise a
        tuple starting with the prediction, followed by the optional items in
        the order intervals, samples, counts, model.
    """
    climate = conf.climate + '_ensemble'
    age = conf.age + '_ensemble'

    x = ds[age].values.ravel()
    y = ds[climate].values.ravel()
    mask = (~np.isnan(x)) & (~np.isnan(y))
    if not mask.any():
        return
    x = x[mask]
    y = y[mask]

    gam = pygam.LinearGAM(pygam.s(0)).gridsearch(
        x[:, np.newaxis], y, progress=False)

    time = np.asarray(time)
    ret = (gam.predict(time), )
    if quantiles is not None:
        ret = ret + (gam.prediction_intervals(time, quantiles=quantiles), )
    if size is not None:
        ret = ret + (gam.sample(
            x[:, np.newaxis], y, sample_at_X=time, n_draws=size).T, )

    # One tree over all ensemble ages serves both radius queries below.
    tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
    if return_counts:
        # float so that the NaN masking below can be applied to it as well
        counts = tree.query_radius(time[:, np.newaxis], return_counts,
                                   count_only=True).astype(float)
        ret = ret + (counts, )

    # look how many samples in the ensemble fall into the `max_time_diff`
    # time interval around the predicted time
    counts = tree.query_radius(time[:, np.newaxis], max_time_diff,
                               count_only=True)
    idx = counts < 100
    if idx.any():
        for arr in ret:
            arr[idx] = np.nan

    if return_gam:
        return ret + (gam, )
    else:
        return ret
def fit(self):
    """Tune a GAM explainer on the training data.

    Every feature in ``self.feature_names`` contributes one term: a spline
    term when the name is listed in ``self.numerical_features``, a factor
    term otherwise. ``self.mode`` selects ``LinearGAM`` (``'regression'``)
    or ``LogisticGAM`` (``'classification'``); the model is tuned with
    ``gridsearch`` and stored in ``self.explainer``.

    Raises:
        NameError: if ``self.mode`` is neither of the two supported values.
    """
    def term_for(position):
        if self.feature_names[position] in self.numerical_features:
            return s(position)
        return f(position)

    S = term_for(0)
    for position in range(1, len(self.feature_names)):
        S += term_for(position)

    if self.mode == 'regression':
        gam = LinearGAM(S)
        design = self.X_train
    elif self.mode == 'classification':
        gam = LogisticGAM(S)
        # the classification path hands pyGAM a plain array
        design = np.array(self.X_train)
    else:
        raise NameError(
            'ERROR: mode should be regression or classification')

    gam.gridsearch(design, self.y_train)
    self._is_fitted = True
    self.explainer = gam
def calibrate_propensities(propensities, treatment):
    """Recalibrate propensity scores against the observed treatments.

    A ``LogisticGAM`` with a single spline term is fitted with the scores as
    the only feature and the treatment indicator as the response; its
    predicted probabilities for the same scores are the calibrated values.

    Args:
        propensities: propensity scores
        treatment: treatment indicator

    Returns:
        p: calibrated version of the propensities given
    """
    calibrator = LogisticGAM(s(0))
    fitted = calibrator.fit(propensities, treatment)
    return fitted.predict_proba(propensities)
def create_rand_gam(number_of_searches, new_values, pred_y, y, pca_splines, pca_lam, pred_splines, pred_lam, pred_factor):
    """Random-search a LogisticGAM over component columns plus a prediction.

    ``pred_y`` is appended to ``new_values`` as the last column. Every
    original column gets a spline term; the appended column gets a factor
    term (``pred_factor`` true) or a spline term. ``gridsearch`` is run over
    ``number_of_searches`` random ``lam`` rows with one entry per term, drawn
    log-uniformly between 1e-4 and 1e4.

    Args:
        number_of_searches: number of random ``lam`` rows to try.
        new_values: 2-D array of component features.
        pred_y: predictions appended as an extra feature column.
        y: binary response.
        pca_splines: ``n_splines`` of the component spline terms.
        pca_lam: initial ``lam`` of the component spline terms.
        pred_splines: ``n_splines`` of the prediction spline term.
        pred_lam: initial ``lam`` of the prediction term.
        pred_factor: use a factor term for the prediction column.

    Returns:
        tuple ``(rand_gam, new_values, titles)``: the searched model, the
        feature matrix including the appended column, and the component
        column indices as strings.
    """
    n_terms = new_values.shape[1] + 1
    # uniform exponents on [-4, 4) -> lam values between 1e-4 and 1e4
    exponents = np.random.rand(number_of_searches, n_terms) * 8 - 4
    lams = 10**exponents

    pred_column = np.array(pred_y).reshape(-1, 1)
    new_values = np.append(new_values, pred_column, axis=1)

    titles = []
    for i in range(new_values.shape[1] - 1):
        titles.append(str(i))
        component = s(i, n_splines=pca_splines, lam=pca_lam)
        if i == 0:
            x = component
        else:
            x = x + component

    # i + 1 is the index of the appended prediction column
    if pred_factor:
        x = x + pygam.terms.f(i + 1, lam=pred_lam)
    else:
        x = x + s(i + 1, n_splines=pred_splines, lam=pred_lam)

    rand_gam = LogisticGAM(x).gridsearch(new_values, y, lam=lams)
    return rand_gam, new_values, titles
def updateEmpTauX(self, bFit=True, mask=None):
    """Update the tau statistics from a smoothed variance-vs-count relation.

    The log of the expected squared differences is regressed on
    ``log(X + 0.5)`` with a monotonically increasing spline
    (``LinearGAM``, 5 splines), using only entries where ``mask`` is
    non-zero. The smoothed values then update ``self.betaTau``,
    ``self.expTau`` and ``self.expLogTau`` where ``mask == 1``.

    Args:
        bFit: refit ``self.gam``; when false the stored model is reused.
        mask: array of shape ``(self.V, self.S)``; defaults to all ones.
            NOTE(review): entries are presumably only 0 or 1 -- selection
            uses ``!= 0`` while the write-back uses ``== 1``.

    If fitting raises ``ValueError``, ``self.updateFixedTau(mask)`` is used
    instead and nothing else is updated.
    """
    if mask is None:
        mask = np.ones((self.V, self.S))

    def selected(matrix):
        # flat array of the entries whose mask value is non-zero
        return np.ma.compressed(np.ma.masked_where(mask == 0, matrix))

    square_diff_matrix = self.exp_square_diff_matrix()
    log_counts_fit = np.log(0.5 + selected(self.X))
    log_var_fit = np.log(selected(square_diff_matrix) + NMF_VB.minVar)

    if bFit:
        try:
            smoother = LinearGAM(
                s(0, n_splines=5, constraints='monotonic_inc'))
            self.gam = smoother.fit(log_counts_fit, log_var_fit)
        except ValueError:
            print("Performing fixed tau")
            self.updateFixedTau(mask)
            return

    counts = selected(self.X)
    smoothed_log_var = self.gam.predict(np.log(0.5 + counts))

    beta_tau = self.beta * (counts + 0.5) + 0.5 * np.exp(smoothed_log_var)
    np.place(self.betaTau, mask == 1, beta_tau)

    exp_tau = (self.alpha + 0.5) / beta_tau
    np.place(self.expTau, mask == 1, exp_tau)

    log_tau = digamma(self.alpha + 0.5) - np.log(beta_tau)
    np.place(self.expLogTau, mask == 1, log_tau)
def mean(self, smooth=None, **kwargs):
    """Compute an estimate of the mean.

    Parameters
    ----------
    smooth: str, default=None
        Name of the smoothing method to use. One of ``'LocalLinear'``,
        ``'GAM'`` or ``'SmoothingSpline'``; ``None`` returns the plain
        point-wise mean.

    Keyword Args
    ------------
    points: default=0.5
        Passed to ``self.smooth`` for ``'LocalLinear'`` smoothing.
    neighborhood: int
        Passed to ``self.smooth`` for ``'LocalLinear'`` smoothing. Defaults
        to ``int(p * exp(-(log(log(p)))**2))`` where ``p`` is the number of
        sampling points.
    n_basis: int, default=10
        Number of splines basis used for GAM smoothing.

    Returns
    -------
    obj: DenseFunctionalData object
        An estimate of the mean as a DenseFunctionalData object with the
        same argvals as `self` and one observation.

    Raises
    ------
    ValueError
        If smoothing is requested for data with more than one dimension.
    NotImplementedError
        If ``smooth`` names an unknown method.
    """
    mean_estim = self.values.mean(axis=0)

    if smooth is not None:
        argvals = self.argvals['input_dim_0']
        if self.n_dim > 1:
            raise ValueError('Only one dimensional data can be smoothed.')

        if smooth == 'LocalLinear':
            p = self.n_points['input_dim_0']
            points = kwargs.get('points', 0.5)
            if 'neighborhood' in kwargs:
                neigh = kwargs['neighborhood']
            else:
                # builtin int: the np.int alias was removed in NumPy 1.24
                neigh = int(p * np.exp(-(np.log(np.log(p)))**2))
            data_smooth = self.smooth(points=points, neighborhood=neigh)
            mean_estim = data_smooth.values.mean(axis=0)
        elif smooth == 'GAM':
            n_basis = kwargs.get('n_basis', 10)
            mean_estim = pygam.LinearGAM(pygam.s(0, n_splines=n_basis)).\
                fit(argvals, mean_estim).\
                predict(argvals)
        elif smooth == 'SmoothingSpline':
            ss = SmoothingSpline()
            mean_estim = ss.fit_predict(argvals, mean_estim)
        else:
            raise NotImplementedError('Smoothing method not implemented.')

    return DenseFunctionalData(self.argvals, mean_estim[np.newaxis])
import patsy as pt
import numpy as np
from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

# Script: fit a six-term spline GAM of happiness and prepare one
# partial-dependence panel per regressor.
# NOTE(review): `pd`, `LinearGAM` and `s` are used below but not imported in
# the visible lines -- presumably imported elsewhere; confirm.

# Prep the dataset
data = pd.read_csv(
    "/home/dusty/Econ8310/DataSets/HappinessWorld.csv")

# Generate x and y matrices (the "-1" drops the intercept column, so the
# columns of x follow the order of the regressors in the formula)
eqn = """happiness ~ -1 + freedom + family + year + economy + health + trust"""
y,x = pt.dmatrices(eqn, data=data)

# Initialize and fit the model: one spline term per regressor column
gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5))
gam = gam.gridsearch(np.asarray(x), y)

# Specify plot shape
titles = ['freedom', 'family', 'year', 'economy', 'health', 'trust']
fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles)
fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False)

# Build the effect curve and both 95% interval bounds for every term.
# NOTE(review): the traces are not attached to `fig` within the visible
# lines -- the loop body may continue beyond this excerpt.
for i, title in enumerate(titles):
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, width=.95)
    trace = go.Scatter(x=XX[:,i], y=pdep, mode='lines', name='Effect')
    ci1 = go.Scatter(x = XX[:,i], y=confi[:,0], line=dict(dash='dash', color='grey'), name='95% CI')
    ci2 = go.Scatter(x = XX[:,i], y=confi[:,1], line=dict(dash='dash', color='grey'), name='95% CI')