def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Cross-validate one additive spline model per target column.

    For every column of ``Y`` a ``LinearGAM`` with one spline term per column
    of ``X`` is tuned with ``gridsearch`` inside each fold produced by
    ``BalancedKFold``. The out-of-fold predictions are pooled and scored
    against the whole target. A separate model is then tuned on all rows to
    obtain the in-sample scores.

    Args:
        X: DataFrame of predictors (rows are selected with ``.iloc``).
        Y: DataFrame whose columns are the targets to model.
        get_importance: When true, ``get_importances`` is called in every
            fold and its per-predictor values are collected.
        n_splines: Number of splines given to every ``s`` term.
        folds: Value handed to ``BalancedKFold``.

    Returns:
        dict keyed by target name. Each value holds ``scores_cv`` and
        ``scores_insample`` (one-element lists of ``r``/``R2``/``MAE``
        dicts), ``pred_vars`` (``X.columns``), ``importances`` (per-fold
        lists, left empty unless ``get_importance``) and ``model`` (the GAM
        tuned on all rows).
    """

    def _scores(observed, predicted):
        # Pearson r is computed once and reused for R2.
        r = np.corrcoef(observed, predicted)[0, 1]
        return [{'r': r,
                 'R2': r ** 2,
                 'MAE': mean_absolute_error(observed, predicted)}]

    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)

    GAM_results = {}
    # ``items`` replaces ``iteritems``, which pandas 2.0 removed.
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # importances are defined as the predictive ability of each
                # variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)
        cv_scores = _scores(y, pred)

        # insample: start from an unfitted model so that the model left over
        # from the last fold cannot take part in this search.
        gam = LinearGAM(formula)
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = _scores(y, in_pred)

        GAM_results[name] = {'scores_cv': cv_scores,
                             'scores_insample': in_scores,
                             'pred_vars': X.columns,
                             'importances': importances,
                             'model': gam}
    return GAM_results
def BAM(X, y):
    """Tune a two-feature additive model with an interaction term (pyGAM).

    The model has a cubic spline on feature 0, a cubic spline on feature 1
    and a tensor-product term over both.

    Args:
        X: Feature matrix; columns 0 and 1 are used.
        y: Target values.

    Returns:
        The ``LinearGAM`` instance on which ``gridsearch`` was run.
    """
    terms = s(0, spline_order=3) + s(1, spline_order=3) + te(0, 1)
    model = LinearGAM(terms)
    model.gridsearch(X, y)
    return model
def run_GAM(X, Y, get_importance=False, n_splines=20, folds=10):
    """Cross-validate one additive spline model per target column.

    For every column of ``Y`` a ``LinearGAM`` with one spline term per column
    of ``X`` is tuned with ``gridsearch`` inside each fold produced by
    ``BalancedKFold``. The out-of-fold predictions are pooled and scored
    against the whole target. A separate model is then tuned on all rows to
    obtain the in-sample scores.

    Args:
        X: DataFrame of predictors (rows are selected with ``.iloc``).
        Y: DataFrame whose columns are the targets to model.
        get_importance: When true, ``get_importances`` is called in every
            fold and its per-predictor values are collected.
        n_splines: Number of splines given to every ``s`` term.
        folds: Value handed to ``BalancedKFold``.

    Returns:
        dict keyed by target name. Each value holds ``scores_cv`` and
        ``scores_insample`` (one-element lists of ``r``/``R2``/``MAE``
        dicts), ``pred_vars`` (``X.columns``), ``importances`` (per-fold
        lists, left empty unless ``get_importance``) and ``model`` (the GAM
        tuned on all rows).
    """

    def _scores(observed, predicted):
        # Pearson r is computed once and reused for R2.
        r = np.corrcoef(observed, predicted)[0, 1]
        return [{
            'r': r,
            'R2': r ** 2,
            'MAE': mean_absolute_error(observed, predicted)
        }]

    # set up GAM: one spline term per predictor column
    formula = s(0, n_splines)
    for i in range(1, X.shape[1]):
        formula = formula + s(i, n_splines)

    GAM_results = {}
    # ``items`` replaces ``iteritems``, which pandas 2.0 removed.
    for name, y in Y.items():
        print("\nFitting for %s\n" % name)
        CV = BalancedKFold(folds)
        importances = {k: [] for k in X.columns}
        pred = np.zeros(y.shape[0])
        for train, test in CV.split(X, y):
            Xtrain = X.iloc[train, :]
            ytrain = y.iloc[train]
            Xtest = X.iloc[test, :]
            ytest = y.iloc[test]
            gam = LinearGAM(formula)
            gam.gridsearch(Xtrain, ytrain)

            # out of fold
            p = gam.predict(Xtest)
            if len(p.shape) > 1:
                p = p[:, 0]
            pred[test] = p

            if get_importance:
                # importances are defined as the predictive ability of each
                # variable on its own
                importance_out = get_importances(Xtrain, ytrain, Xtest, ytest)
                for k, v in importance_out.items():
                    importances[k].append(v)
        cv_scores = _scores(y, pred)

        # insample: start from an unfitted model so that the model left over
        # from the last fold cannot take part in this search.
        gam = LinearGAM(formula)
        gam.gridsearch(X, y)
        in_pred = gam.predict(X)
        in_scores = _scores(y, in_pred)

        GAM_results[name] = {
            'scores_cv': cv_scores,
            'scores_insample': in_scores,
            'pred_vars': X.columns,
            'importances': importances,
            'model': gam
        }
    return GAM_results
def pspline(time, flux, edge_cutoff, max_splines, return_nsplines, verbose):
    """Estimate the trend of a time series with an iteratively clipped spline GAM.

    The flux is first divided by its NaN-ignoring median. Up to
    ``constants.PSPLINES_MAXITER`` times, the points whose detrended value
    lies within ``constants.PSPLINES_STDEV_CUT`` standard deviations of 1 are
    kept, a one-term spline ``LinearGAM`` is tuned on them with
    ``gridsearch`` and the kept points are detrended again. The loop ends
    early once no point exceeds the cut. A last fit on the retained points is
    evaluated on the full, unclipped ``time`` array.

    Args:
        time: Array of time stamps (must support ``.copy()`` and index
            arrays).
        flux: Array of flux values matching ``time``.
        edge_cutoff: When positive, the trend at both ends, up to this
            distance from the minimum and maximum time, is set to NaN.
        max_splines: ``n_splines`` given to the spline term.
        return_nsplines: Not read by this function; both values are always
            returned.
        verbose: When true, print progress for every iteration.

    Returns:
        Tuple ``(trend, nsplines)`` where ``nsplines`` is the ceiling of the
        ``edof`` entry of the final model's ``statistics_``.

    Raises:
        ImportError: If pygam cannot be imported.
    """
    try:
        from pygam import LinearGAM, s
    except ImportError as err:
        raise ImportError('Could not import pygam') from err

    newflux = flux.copy()
    newtime = time.copy()
    detrended_flux = flux.copy() / np.nanmedian(newflux)

    for i in range(constants.PSPLINES_MAXITER):
        # Keep only the points inside the clipping band.
        mask_outliers = np.ma.where(
            np.abs(1 - detrended_flux) <
            constants.PSPLINES_STDEV_CUT * np.std(detrended_flux))
        newtime, newflux = cleaned_array(newtime[mask_outliers],
                                         newflux[mask_outliers])
        gam = LinearGAM(s(0, n_splines=max_splines))
        search_gam = gam.gridsearch(newtime[:, np.newaxis], newflux,
                                    progress=False)
        trend = search_gam.predict(newtime)
        detrended_flux = newflux / trend
        stdev = np.std(detrended_flux)
        # Points that still fall outside the band after this fit.
        mask_outliers = np.ma.where(
            np.abs(1 - detrended_flux) > constants.PSPLINES_STDEV_CUT * stdev)
        if verbose:
            print('Iteration:', i + 1, 'Rejected outliers:',
                  len(mask_outliers[0]))

        # Check convergence
        if len(mask_outliers[0]) == 0:
            # Honour ``verbose`` here as well instead of always printing.
            if verbose:
                print('Converged.')
            break

    # Final iteration, applied to unclipped time series
    # (interpolated over clipped values)
    mask_outliers = np.ma.where(
        np.abs(1 - detrended_flux) < constants.PSPLINES_STDEV_CUT * stdev)
    newtime, newflux = cleaned_array(newtime[mask_outliers],
                                     newflux[mask_outliers])
    gam = LinearGAM(s(0, n_splines=max_splines))
    search_gam = gam.gridsearch(newtime[:, np.newaxis], newflux,
                                progress=False)
    trend = search_gam.predict(time)

    # Cut off edges
    if edge_cutoff > 0:
        low_index = np.argmax(time > (min(time) + edge_cutoff))
        hi_index = np.argmax(time > (max(time) - edge_cutoff))
        trend[:low_index] = np.nan
        trend[hi_index:] = np.nan

    nsplines = np.ceil(gam.statistics_['edof'])
    return trend, nsplines
def GAM_linear(X, y):
    """Tune a three-term GAM and tabulate its in-sample residuals.

    The model has spline terms on features 0 and 1 and a factor term on
    feature 2.

    Args:
        X: Object with ``to_numpy()`` holding the features.
        y: Object with ``to_numpy()`` holding the target.

    Returns:
        Tuple ``(model, summary, frame)``: the tuned ``LinearGAM``, whatever
        ``model.summary()`` returns, and a DataFrame with the prediction in
        column ``0`` plus ``actual`` and ``residual`` columns.
    """
    features = X.to_numpy()
    target = y.to_numpy()

    from pygam import LinearGAM, s, f, te

    model = LinearGAM(s(0) + s(1) + f(2))
    model.gridsearch(features, target)

    frame = pd.DataFrame(model.predict(features))
    frame['actual'] = target
    # residual = observed minus predicted
    frame['residual'] = frame['actual'] - frame[0]
    return model, model.summary(), frame
def get_importances(X, y, Xtest, ytest):
    """Score every predictor by how well it predicts the target on its own.

    For each column of ``X`` a single-spline ``LinearGAM`` without intercept
    is fitted and grid-searched on that column alone, then used to predict
    from the same column of ``Xtest``.

    Args:
        X: DataFrame of training predictors.
        y: Training target.
        Xtest: DataFrame of held-out predictors with the same column names.
        ytest: Held-out target.

    Returns:
        dict mapping column name to the squared Pearson correlation between
        ``ytest`` and the single-predictor prediction. Empty when ``X`` has
        no columns.
    """
    importances = {}
    # ``items`` replaces ``iteritems``, which pandas 2.0 removed.
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        # NOTE(review): this plain fit before the grid search looks
        # redundant; it is kept so the set of fitted models is unchanged.
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        importances[predictor] = np.corrcoef(ytest, pred)[0, 1] ** 2
    return importances
def get_importances(X, y, Xtest, ytest):
    """Score every predictor by how well it predicts the target on its own.

    For each column of ``X`` a single-spline ``LinearGAM`` without intercept
    is fitted and grid-searched on that column alone, then used to predict
    from the same column of ``Xtest``.

    Args:
        X: DataFrame of training predictors.
        y: Training target.
        Xtest: DataFrame of held-out predictors with the same column names.
        ytest: Held-out target.

    Returns:
        dict mapping column name to the squared Pearson correlation between
        ``ytest`` and the single-predictor prediction. Empty when ``X`` has
        no columns.
    """
    importances = {}
    # ``items`` replaces ``iteritems``, which pandas 2.0 removed.
    for predictor, vals in X.items():
        gam = LinearGAM(s(0), fit_intercept=False)
        # NOTE(review): this plain fit before the grid search looks
        # redundant; it is kept so the set of fitted models is unchanged.
        gam.fit(vals, y)
        gam.gridsearch(vals, y)
        pred = gam.predict(Xtest[predictor])
        # define importances as the R2 for that factor alone
        R2 = np.corrcoef(ytest, pred)[0, 1] ** 2
        importances[predictor] = R2
    return importances
def GAM1(self):
    """Compare an all-linear GAM with pyGAM's automatic term selection.

    Generalized Additive Model with possible non-linear effects, where
    specific variables are modelled by splines. The exercise asks whether
    possible non-linearities can be captured by adding polynomial terms to
    the linear model: fit such a model and comment on the two solutions.

    Two models are tuned on ``self.Xtrain``/``self.ytrain`` with the same
    ``lam`` grid and scored on ``self.Xtest``/``self.ytest``: first one with
    a linear term for each of features 0..23, then one with
    ``terms='auto'``. When ``self.plot`` is set, prediction and test data are
    plotted for each. The first element of each MSE is stored on
    ``self.GAM1E1P5`` and ``self.GAM2E1P5`` for Exercise 7.

    Returns:
        The constant ``1``.
    """
    from pygam import LinearGAM, s, l, f

    def score_model(terms, title):
        """Tune one model, print its summary, optionally plot, return its MSE."""
        model = LinearGAM(terms=terms, fit_intercept=False)
        # Generate the model
        tuned = model.gridsearch(self.Xtrain.values, self.ytrain.values,
                                 lam=np.logspace(-3, 3, 11))
        tuned.summary()  # author's recorded Pseudo-R2: 0.6449
        predicted = tuned.predict(self.Xtest).reshape(-1, 1)
        mse = np.mean((self.ytest - predicted) ** 2).values
        if self.plot:
            plt.plot(predicted, label='GAM model')
            plt.plot(self.ytest, label='Testing Data')
            plt.legend()
            plt.title(title)
            plt.ylabel("FFVC score")
            plt.xlabel("Sample no.")
            plt.show()
        return mse

    # Non-linear effects are modeled by splines. Analyze the summary table
    # and declare which factors should be splined, depending on the
    # so-called significance code of the table.
    linear_terms = l(0)
    for feature in range(1, 24):
        linear_terms = linear_terms + l(feature)
    mse_linear = score_model(linear_terms, "GAM model with linear terms")

    # Repeat the study with the 'auto' setting, adding splines and
    # polynomial contributions.
    mse_spline = score_model('auto', "GAM model with spline terms")

    print(f"Linear GAM produced MSE={mse_linear}," + "\n"
          + f"Spline addition produced MSE={mse_spline}")

    # Save these values for Exercise 7.
    self.GAM1E1P5 = mse_linear[0]
    self.GAM2E1P5 = mse_spline[0]
    return 1
def pspline(time, flux):
    """Estimate the trend of a time series with an iteratively clipped spline GAM.

    Up to ``constants.PSPLINES_MAXITER`` times, points are selected by the
    clipping condition, a one-term spline ``LinearGAM`` with
    ``constants.PSPLINES_MAX_SPLINES`` splines is tuned on them with
    ``gridsearch`` and the kept points are detrended again. The loop ends
    early once no point exceeds the cut. A last fit on the retained points is
    evaluated on the full, unclipped ``time`` array.

    Args:
        time: Array of time stamps (must support ``.copy()`` and index
            arrays).
        flux: Array of flux values matching ``time``.

    Returns:
        The trend evaluated at every entry of ``time``.

    Raises:
        ImportError: If pygam cannot be imported.
    """
    try:
        from pygam import LinearGAM, s
    except ImportError as err:
        raise ImportError('Could not import pygam') from err

    newflux = flux.copy()
    newtime = time.copy()
    detrended_flux = flux.copy()

    for i in range(constants.PSPLINES_MAXITER):
        # NOTE(review): the comparison is signed (no absolute value) and the
        # flux is not normalised before the first pass, so only values far
        # below 1 are removed. Confirm this one-sided clip is intended.
        mask_outliers = numpy.ma.where(
            1 - detrended_flux <
            constants.PSPLINES_STDEV_CUT * numpy.std(detrended_flux))
        newtime, newflux = cleaned_array(newtime[mask_outliers],
                                         newflux[mask_outliers])
        gam = LinearGAM(s(0, n_splines=constants.PSPLINES_MAX_SPLINES))
        search_gam = gam.gridsearch(newtime[:, numpy.newaxis], newflux,
                                    progress=False)
        trend = search_gam.predict(newtime)
        detrended_flux = newflux / trend
        stdev = numpy.std(detrended_flux)
        # Points that still fall outside the cut after this fit.
        mask_outliers = numpy.ma.where(
            1 - detrended_flux > constants.PSPLINES_STDEV_CUT * stdev)
        print('Iteration:', i + 1, 'Rejected outliers:',
              len(mask_outliers[0]))

        # Check convergence
        if len(mask_outliers[0]) == 0:
            print('Converged.')
            break

    # Final iteration, applied to unclipped time series
    # (interpolated over clipped values)
    mask_outliers = numpy.ma.where(
        1 - detrended_flux < constants.PSPLINES_STDEV_CUT * stdev)
    newtime, newflux = cleaned_array(newtime[mask_outliers],
                                     newflux[mask_outliers])
    gam = LinearGAM(s(0, n_splines=constants.PSPLINES_MAX_SPLINES))
    search_gam = gam.gridsearch(newtime[:, numpy.newaxis], newflux,
                                progress=False)
    trend = search_gam.predict(time)

    return trend
def GAM_model(df, feature_list):
    """Tune a six-spline GAM on min-max scaled features to predict ``logerror``.

    Args:
        df: DataFrame holding the feature columns and a ``logerror`` column.
        feature_list: Names of the feature columns; the model addresses
            features 0 through 5.

    Returns:
        Tuple ``(RMSE, R2, model)``. Both in-sample metrics are rounded to
        three decimals through string formatting.
    """
    features = df[feature_list]
    target = df[['logerror']]

    # Scale every feature to [0, 1], then hand plain arrays to pyGAM.
    scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(features)
    scaled_frame = pd.DataFrame(scaler.transform(features),
                                columns=features.columns.values)
    scaled_frame = scaled_frame.set_index([features.index.values])
    scaled = scaled_frame.to_numpy()
    target = target.to_numpy()

    from pygam import LinearGAM, s, f, te

    terms = s(0)
    for column in range(1, 6):
        terms = terms + s(column)
    model = LinearGAM(terms)
    model.gridsearch(scaled, target)

    results = pd.DataFrame(model.predict(scaled))
    results['actual'] = target
    results.columns = ['predicted', 'actual']

    rmse_raw = sqrt(mean_squared_error(results.actual, results.predicted))
    r2_raw = r2_score(results.actual, results.predicted)
    RMSE = float(format(rmse_raw, '.3f'))
    R2 = float(format(r2_raw, '.3f'))
    return RMSE, R2, model
def fit(self):
    """Tune a GAM explainer on the stored training data.

    Each feature gets a spline term when its name is listed in
    ``self.numerical_features`` and a factor term otherwise. ``self.mode``
    selects ``LinearGAM`` ('regression') or ``LogisticGAM``
    ('classification'). The tuned model is stored on ``self.explainer`` and
    ``self._is_fitted`` is set.

    Raises:
        NameError: If ``self.mode`` is neither 'regression' nor
            'classification'.
    """
    names = self.feature_names

    # One term per feature position, joined in feature order.
    terms = s(0) if names[0] in self.numerical_features else f(0)
    for position, name in enumerate(names[1:], start=1):
        if name in self.numerical_features:
            terms += s(position)
        else:
            terms += f(position)

    if self.mode == 'regression':
        model = LinearGAM(terms)
        features = self.X_train
    elif self.mode == 'classification':
        model = LogisticGAM(terms)
        # Only the classification path converts the features to an array.
        features = np.array(self.X_train)
    else:
        raise NameError(
            'ERROR: mode should be regression or classification')

    model.gridsearch(features, self.y_train)
    self._is_fitted = True
    self.explainer = model
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    """Tune GAM hyper-parameters by 10-fold search, then score them leave-one-out.

    Stage 1: on each of 10 folds a fresh ``LinearGAM`` is grid-searched over
    ``n_splines`` 15..34 and ``lam`` in {0.5, 0.6, 0.7}; the selected values
    are averaged over the folds.
    Stage 2: a model built from the averaged values is refitted for every
    leave-one-out split, and the held-out predictions are compared with the
    true values by Spearman correlation.

    Args:
        index_set: Positions of the ``gene_expression`` columns to use.
        gene_expression: DataFrame of predictors.
        cell_count_aa: Target values, one per row of ``gene_expression``.

    Returns:
        Tuple ``(lams_mean, n_splines_mean, rho, pval)``.
    """
    # Rows are addressed by position below; a plain array avoids label-based
    # lookups when a pandas Series is passed in.
    y = np.asarray(cell_count_aa)
    X = gene_expression[gene_expression.columns[index_set]]

    n_splines_all = []
    lam_all = []
    for train_index, _ in KFold(n_splits=10).split(X):
        # Use an unfitted model per fold so the model fitted on the previous
        # fold cannot take part in this fold's search.
        gam = LinearGAM().gridsearch(X.iloc[train_index], y[train_index],
                                     n_splines=np.arange(15, 35),
                                     lam=[0.5, 0.6, 0.7])
        n_splines_all.append(gam.n_splines)
        lam_all.append(gam.lam)

    lams_mean = np.array(lam_all).mean()
    n_splines_mean = np.array(n_splines_all).mean()

    # n_splines is a count and the fold average is generally fractional, so
    # the model gets the rounded value while the raw mean is still reported.
    gam = LinearGAM(n_splines=int(round(n_splines_mean)), lam=lams_mean)

    prediction = []
    actual_value = []
    for train_index, test_index in LeaveOneOut().split(X):
        regr = gam.fit(X.iloc[train_index], y[train_index])
        y_test = y[test_index]
        prediction_val = regr.predict(X.iloc[test_index])[0]
        prediction.append(prediction_val)
        actual_value.append(y_test[0])
        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))

    # calculate spearman correlation over all of the models
    rho, pval = spearmanr(actual_value, prediction)
    return lams_mean, n_splines_mean, rho, pval
def find_parameters_evaluation(index_set, gene_expression, cell_count_aa):
    """Tune a GAM on all rows, then score the tuned model leave-one-out.

    A ``LinearGAM`` is grid-searched once on the complete data over
    ``n_splines`` 10..49 and ``lam`` in {0.4, ..., 0.8}. That model is then
    refitted for every leave-one-out split and its held-out predictions are
    compared with the true values by Spearman correlation.

    NOTE(review): the hyper-parameters are selected on data that includes
    every held-out row, so the reported correlation is not a fully
    out-of-sample estimate.

    Args:
        index_set: Positions of the ``gene_expression`` columns to use.
        gene_expression: DataFrame of predictors.
        cell_count_aa: Target values, one per row of ``gene_expression``.

    Returns:
        Tuple ``(lams_mean, n_splines_mean, rho, pval)``; the means are taken
        over the ``lam``/``n_splines`` read back from the model after each
        refit.
    """
    # Rows are addressed by position below; a plain array avoids label-based
    # lookups when a pandas Series is passed in.
    y = np.asarray(cell_count_aa)
    X = gene_expression[gene_expression.columns[index_set]]

    gam = LinearGAM()
    gam = gam.gridsearch(X, y, n_splines=np.arange(10, 50),
                         lam=[0.4, 0.5, 0.6, 0.7, 0.8])

    prediction = []
    actual_value = []
    n_splines_all = []
    lam_all = []
    # OUTER LOOP: validation/testing, one refit per held-out row.
    for train_index, test_index in LeaveOneOut().split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train model with the tuned parameters
        regr = gam.fit(X_train, y_train)

        # make a prediction on the held-out row
        prediction_val = regr.predict(X_test)[0]

        # store predictions and actual values
        prediction.append(prediction_val)
        actual_value.append(y_test[0])

        # record the parameter values carried by the model
        n_splines_all.append(regr.n_splines)
        lam_all.append(regr.lam)

        print(test_index)
        print(str(prediction_val), " ", str(y_test[0]))

    # calculate spearman correlation over all of the models
    rho, pval = spearmanr(actual_value, prediction)

    lams_mean = np.array(lam_all).mean()
    n_splines_mean = np.array(n_splines_all).mean()

    return lams_mean, n_splines_mean, rho, pval
def AAM():
    """Tune the extended additive model on the module-level ``X1``/``y1``.

    Prints the pyGAM summary of the tuned model followed by its in-sample
    RMSE, MAE and MAPE. Returns nothing.
    """
    gam = LinearGAM(
        s(0, n_splines=25, spline_order=3, constraints='concave',
          penalties='auto', basis='cp', edge_knots=[147, 147])
        + l(3)       # the last travel time
        + te(0, 1)   # distance and departure_time
        + te(2, 0)   # distance and isWeekend
        + l(2),      # isWeekend
        fit_intercept=True)

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    gam.gridsearch(X1, y1).summary()
    # print(gam.gridsearch(X1,y1).get_params(deep=True))

    # Disabled plot, kept for reference:
    # plt.scatter(X1[:,0][0:56], y1[0:56], s=3, linewidth=1, label = 'data')
    # plt.plot(X1[:,0][0:56], gam.predict(X1[0:56]), color = 'red',
    #          linewidth = 1, label = 'prediction')
    # plt.legend()
    # plt.title('Extended Additive Model')
    # plt.show()

    # error calculation: predict once and reuse for all three metrics
    predictions = gam.predict(X1)
    rmse_val = rmse(np.array(y1), np.array(predictions))
    print("RMSE is: "+str(rmse_val))
    mae = mean_absolute_error(y1, predictions)
    print("MAE is: "+str(mae))
    mape = mean_absolute_percentage_error(np.array(y1), np.array(predictions))
    print("MAPE is: "+ str(mape))
# ![Captura%20de%20pantalla%202021-01-11%20a%20las%2017.05.01.png](attachment:Captura%20de%20pantalla%202021-01-11%20a%20las%2017.05.01.png)

# #### Build the model:

# In[8]:


model = LinearGAM(n_splines=10)


# * Fit the model to our training data set:

# In[9]:


model.gridsearch(X_train, y_train)


# #### Prediction

# In[10]:


# Model prediction on the validation features
y_pred_validation = model.predict(X_validation)
y_pred_validation


# #### Evaluating our model:
y = loan3000[outcome]

# Reference classifiers, all fitted on the same predictors and labels.
loan_tree = DecisionTreeClassifier(
    random_state=1, criterion='entropy', min_impurity_decrease=0.003,
).fit(X, y)

loan_lda = LinearDiscriminantAnalysis().fit(X, y)

logit_reg = LogisticRegression(penalty="l2", solver='liblinear').fit(X, y)

## model
# The GAM is tuned on a 0/1 encoding of the outcome (1 = 'default').
default_flags = [1 if label == 'default' else 0 for label in y]
gam = LinearGAM(s(0) + s(1))
print(gam.gridsearch(X.values, default_flags))

models = {
    'Decision Tree': loan_tree,
    'Linear Discriminant Analysis': loan_lda,
    'Logistic Regression': logit_reg,
    'Generalized Additive Model': gam,
}

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(5, 5))

# Regular grid over both predictors; from here on X holds the grid points.
xvalues = np.arange(0.25, 0.73, 0.005)
yvalues = np.arange(-0.1, 20.1, 0.1)
xx, yy = np.meshgrid(xvalues, yvalues)
X = np.column_stack((xx.ravel(), yy.ravel()))
from pygam.datasets import wage
from pygam import LinearGAM, s, f
import numpy as np
import matplotlib.pyplot as plt

X, y = wage()

# Baseline fit with default penalties.
gam = LinearGAM(s(0, n_splines=5) + s(1) + f(2)).fit(X, y)
gam.summary()

# Grid search: the same five candidate penalties for each of the three terms.
lam = np.logspace(-3, 5, 5)
lams = [lam] * 3

gam.gridsearch(X, y, lam=lams)
gam.summary()

# Random search over 100 penalty triples: uniform draws on [0, 1) are
# stretched to [-3, 5) and exponentiated, giving values from e**-3 to e**5.
lams = np.exp(np.random.rand(100, 3) * 8 - 3)

random_gam = LinearGAM(s(0) + s(1) + f(2)).gridsearch(X, y, lam=lams)
random_gam.summary()

# True when the grid search reached a lower GCV score than the random search.
print(gam.statistics_['GCV'] < random_gam.statistics_['GCV'])

for i, term in enumerate(gam.terms):
    # The intercept has no feature grid to generate.
    if term.isintercept:
        continue

    XX = gam.generate_X_grid(term=i)
class GAMEnsemble(EnsembleModel):
    """Implements GAM ensemble in [1]."""

    def __init__(self, nonlinear_ensemble=False, residual_process=True):
        """Initializer.

        Args:
            nonlinear_ensemble: (bool) Whether to use nonlinear (spline)
                terms to transform the base model predictions.
            residual_process: (bool) Whether to model the residual process.
        """
        if residual_process:
            model_name = "Generalized Additive Ensemble"
        else:
            model_name = "{} Stacking".format(
                "Nonlinear" if nonlinear_ensemble else "Linear")

        super().__init__(model_name)
        self.gam_model = None
        self.nonlinear_ensemble = nonlinear_ensemble
        self.model_residual = residual_process

    def train(self, X, y, base_pred):
        """Trains ensemble model based on data and base predictions.

        Stores the tuned model on the attribute ``gam_model``.

        Args:
            X: (np.ndarray) Training features, shape (N, D)
            y: (np.ndarray) Training labels, shape (N, 1)
            base_pred: (dict of np.ndarray) Dictionary of base model
                predictions, with keys (str) being model name and values
                (np.ndarray) being predictions corresponding to X and y.
        """
        # build feature and gam terms
        ens_feature, feature_terms = self._build_ensemble_feature(X, base_pred)

        # define model
        self.gam_model = LinearGAM(feature_terms)

        # additional fine-tuning over a random grid of penalties
        lam_grid = self._build_lambda_grid(n_grid=100)
        self.gam_model.gridsearch(X=ens_feature, y=y, lam=lam_grid,
                                  progress=False)

    def predict(self, X, base_pred):
        """Predicts label based on feature and base model.

        Args:
            X: (np.ndarray) Features, shape (N, D)
            base_pred: (dict of np.ndarray) Dictionary of base model
                predictions, with keys (str) being model name and values
                (np.ndarray) being predictions corresponding to X.

        Returns:
            (np.ndarray, np.ndarray) Ensemble prediction and a variance
            estimate, computed as the squared half-distance between the
            prediction and the upper bound of the 95% prediction interval.

        Raises:
            (ValueError) If ``gam_model`` has not been set by ``train``.
        """
        if self.gam_model is None:
            raise ValueError("Attribute gam_model empty. "
                             "Model was not trained properly.")

        # build feature and gam terms
        ens_feature, _ = self._build_ensemble_feature(X, base_pred)

        # prediction
        prediction = self.gam_model.predict(ens_feature)
        upper_bound = self.gam_model.prediction_intervals(
            ens_feature, width=.95)[:, 1]
        # NOTE(review): the half-width is divided by 2 rather than by the
        # normal 97.5% quantile (about 1.96); confirm the approximation is
        # intended.
        prediction_var = ((upper_bound - prediction) / 2) ** 2

        return prediction, prediction_var

    def _build_ensemble_feature(self, X, base_pred):
        """Builds feature array and corresponding GAM TermList.

        The base predictions form the first columns, each with a spline
        (nonlinear ensemble) or linear term. When the residual process is
        modelled, the columns of X are appended; their terms are one spline
        per dimension plus, if X has more than one column, a tensor-product
        term across all of its dimensions.

        Returns:
            Tuple ``(ens_feature, gam_feature_terms)``.
        """
        ensemble_term_func = s if self.nonlinear_ensemble else l

        ens_feature = np.asarray(list(base_pred.values())).T
        n_base = ens_feature.shape[1]
        term_list = [ensemble_term_func(dim_index)
                     for dim_index in range(n_base)]

        # optionally, add residual process
        if self.model_residual:
            # column positions that X occupies after concatenation
            residual_dims = range(n_base, n_base + X.shape[1])

            # build gam terms
            term_list += [s(dim_index) for dim_index in residual_dims]
            if X.shape[1] > 1:
                term_list += [te(*residual_dims)]

            # update features
            ens_feature = np.concatenate([ens_feature, X], axis=1)

        gam_feature_terms = TermList(*term_list)

        return ens_feature, gam_feature_terms

    def _build_lambda_grid(self, n_grid=100):
        """Draws a random grid of penalties for ``gridsearch``.

        Args:
            n_grid: (int) Number of candidate penalty vectors.

        Returns:
            (np.ndarray) Array of shape (n_grid, n_terms) with values between
            exp(-3) and exp(3).
        """
        # count actual number of terms in each nonlinear term
        # (e.g. te(0, 1) will actually have two terms)
        n_terms = np.sum([len(model_term._terms) if model_term.istensor else 1
                          for model_term in self.gam_model.terms])
        lam = np.random.rand(n_grid, n_terms)
        # rescale so the draws span exactly [0, 1]
        lam_norm = (lam - np.min(lam)) / (np.max(lam) - np.min(lam))

        return np.exp((lam_norm - 0.5) * 6)
import numpy as np from plotly import tools import plotly.offline as py import plotly.graph_objs as go # Prep the dataset data = pd.read_csv( "/home/dusty/Econ8310/DataSets/HappinessWorld.csv") # Generate x and y matrices eqn = """happiness ~ -1 + freedom + family + year + economy + health + trust""" y,x = pt.dmatrices(eqn, data=data) # Initialize and fit the model gam = LinearGAM(s(0) + s(1) + s(2) + s(3) + s(4) + s(5)) gam = gam.gridsearch(np.asarray(x), y) # Specify plot shape titles = ['freedom', 'family', 'year', 'economy', 'health', 'trust'] fig = tools.make_subplots(rows=2, cols=3, subplot_titles=titles) fig['layout'].update(height=800, width=1200, title='pyGAM', showlegend=False) for i, title in enumerate(titles): XX = gam.generate_X_grid(term=i) pdep, confi = gam.partial_dependence(term=i, width=.95) trace = go.Scatter(x=XX[:,i], y=pdep, mode='lines', name='Effect') ci1 = go.Scatter(x = XX[:,i], y=confi[:,0], line=dict(dash='dash', color='grey'), name='95% CI') ci2 = go.Scatter(x = XX[:,i], y=confi[:,1], line=dict(dash='dash', color='grey'), name='95% CI') if i<3:
partialResidualPlot(result_spline, house_98105, 'AdjSalePrice', 'SqFtTotLiving', ax)

plt.tight_layout()
plt.show()

### Generalized Additive Models
predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model
# Spline on living area, linear terms for the remaining predictors.
gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
gam.summary()

fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)

titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    # One call yields both the partial dependence and its 95% band.
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r', ls='--')
    ax.set_title(title)
import pickle

timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

X = pd.read_pickle("data/processed/X.pickle")
y = pd.read_pickle("data/processed/y.pickle")
print('Read data.')

# Randomized grid search: one candidate penalty per spline term and row,
# drawn as exp(u) with u uniform on [-3, 5).
lams = np.random.rand(150000, 4) * 8 - 3
lams = np.exp(lams)

print('Initialized Linear GAM.')
gam_grid = LinearGAM(s(0) + s(1) + s(2) + s(3))

print("Grid searching Linear GAM's lambdas.")
gam_grid.gridsearch(X, y, lam=lams)

# The output name combines the start time with the first command-line
# argument.
with open(f"models/{timestamp} {sys.argv[1]}.pickle", "wb") as handle:
    pickle.dump(gam_grid, handle)
print('Serialized GAM as pickle.')

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
gam_grid.summary()

# plotting
# Size the subplot figure directly; a separate plt.figure() call only
# created an extra, unused figure.
fig, axs = plt.subplots(1, 3, figsize=(16, 16 / 1.618))

titles = ["pm10median", "time", "tmpd"]
for i, ax in enumerate(axs):
    XX = gam_grid.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam_grid.partial_dependence(term=i, X=XX))
#'log'
#'inverse-squared'

from pygam.datasets import wage
import matplotlib.pyplot as plt

X, y = wage(return_X_y=True)
# X[0] is the year (author's open question: does X[0] = 0 mean 2000?)
# X[1] is the person's age
# X[2] is the education level: 0 = basic, 1 = upper secondary,
#      2 = university, 3 = postgraduate
# y is income ($)

## model
gam1 = LinearGAM(s(0) + s(1) + f(2), fit_intercept=False)
gam1.gridsearch(X, y)

## plotting
# Size the subplot figure directly; a separate plt.figure() call only
# created an extra, unused figure.
fig, axs = plt.subplots(1, 3, figsize=(10, 7.5))

titles = ['year', 'age', 'education']
for i, ax in enumerate(axs):
    XX = gam1.generate_X_grid(term=i)
    # One call yields both the partial dependence and its 95% band.
    pdep, confi = gam1.partial_dependence(term=i, X=XX, width=.95)
    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r', ls='--')
    ax.set_title(titles[i])

# Default size for figures created from here on.
plt.rcParams['figure.figsize'] = [10, 7.5]
class DeepModels:
    """Builds one of several regressors and drives its training and scoring.

    ``m`` selects the model in ``__init__``; ``self.type`` records how the
    model has to be driven:

    * 1 - recurrent Keras model; inputs are reshaped to
      ``(samples, 1, features)``
    * 2 - dense Keras model
    * 3 - pyGAM ``LinearGAM`` (tuned with ``gridsearch``)

    Type 0 is still handled by the cross-validation methods (it unpacks two
    values from ``evaluate``) but is never assigned by ``__init__``.

    NOTE(review): the cross-validation methods train the same model object on
    every fold without resetting it, so later folds start from the state left
    by earlier ones. Confirm this is acceptable for the reported scores.
    """

    # Model types for which ``cross_eval`` shuffles the folds.
    _SHUFFLED_TYPES = (0, 2)
    # Dense models (type 2) are cross-validated with these fixed settings,
    # independent of the ``bs``/``ep`` arguments.
    _DENSE_CV_BATCH_SIZE = 10
    _DENSE_CV_EPOCHS = 300

    def __init__(self, m, idim=20):
        """Create the model selected by ``m``.

        Args:
            m: Model selector, 0-9 (7 is the GAM). For any other value
                ``self.model`` and ``self.type`` are set to ``None`` so the
                builder methods can still be called on the instance.
            idim: Number of input features for the Keras models.
        """
        builders = {
            0: (self.base, 2),
            1: (self.base2, 2),
            2: (self.returnSequential4, 2),
            3: (self.returnSequential8, 2),
            4: (self.returnSequential15_regularized, 2),
            5: (self.multi_RNN, 1),
            6: (self.multi_lstm, 1),
            8: (self.RNN, 1),
            9: (self.lstm, 1),
        }
        if m == 7:
            self.model = LinearGAM()
            self.type = 3
        elif m in builders:
            build, model_type = builders[m]
            self.model = build(idim)
            self.type = model_type
        else:
            self.model = None
            self.type = None

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _dense_stack(self, widths, idim, regularize_output=False):
        """Return a compiled ReLU network with the given hidden widths and one linear output."""
        model = Sequential()
        model.add(Dense(widths[0], input_dim=idim, activation='relu'))
        for width in widths[1:]:
            model.add(Dense(width, activation='relu'))
        if regularize_output:
            model.add(Dense(1, activation='linear',
                            kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.01)))
        else:
            model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    @staticmethod
    def _as_sequences(X):
        """Reshape a 2-D array to ``(samples, 1, features)`` for recurrent layers."""
        return np.reshape(X, (X.shape[0], 1, X.shape[1]))

    @staticmethod
    def _kfold(k, shuffle):
        """Return a KFold splitter; the seed is only passed when shuffling."""
        if shuffle:
            return KFold(n_splits=k, shuffle=True, random_state=0)
        # Current scikit-learn rejects random_state together with
        # shuffle=False.
        return KFold(n_splits=k, shuffle=False)

    def _run_fold(self, X_train, y_train, X_test, y_test, bs, ep,
                  want_predictions):
        """Train on one fold and return ``(score, predictions or None)``."""
        if self.type == 3:
            self.model.gridsearch(X_train, y_train)
            predictions = self.model.predict(X_test)
            return mean_absolute_error(predictions, y_test), predictions

        if self.type == 1:
            X_train = self._as_sequences(X_train)
            X_test = self._as_sequences(X_test)
        elif self.type == 2:
            bs, ep = self._DENSE_CV_BATCH_SIZE, self._DENSE_CV_EPOCHS

        self.model.fit(X_train, y_train, batch_size=bs, epochs=ep, verbose=0)
        result = self.model.evaluate(X_test, y_test, verbose=0)
        if self.type == 0:
            # Type 0 models are expected to report (loss, metric).
            _, score = result
        else:
            score = result
        predictions = self.model.predict(X_test) if want_predictions else None
        return score, predictions

    # ------------------------------------------------------------------
    # dense model builders
    # ------------------------------------------------------------------
    # Sequential 6 layer neural network
    def returnSequential6(self, idim = 20):
        return self._dense_stack(range(50, 0, -10), idim)

    def returnSequential6_regularized(self, idim = 20):
        return self._dense_stack(range(50, 0, -10), idim,
                                 regularize_output=True)

    def returnSequential9(self, idim = 20):
        return self._dense_stack(range(80, 0, -10), idim)

    def returnSequential15(self, idim = 20):
        return self._dense_stack(range(140, 0, -10), idim)

    def returnSequential15_regularized(self, idim = 20):
        # NOTE(review): despite the name no regularizer is applied; this
        # builds the same network as returnSequential15.
        return self._dense_stack(range(140, 0, -10), idim)

    def returnSequential21(self, idim = 20):
        return self._dense_stack(range(200, 0, -10), idim)

    # Sequential 5 layer neural network (four hidden layers plus output)
    def returnSequential4(self, idim = 20):
        return self._dense_stack(range(20, 0, -5), idim)

    # Sequential 8 layer neural network with a regularized output layer
    def returnSequential8(self, idim=20):
        return self._dense_stack(range(70, 0, -10), idim,
                                 regularize_output=True)

    def base(self, idim=20):
        return self._dense_stack([10], idim)

    def base2(self, idim=20):
        return self._dense_stack([14, 7], idim)

    def baseline(self, idim=20):
        # Create model
        model = Sequential()
        model.add(Dense(20, input_dim=idim, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # Compile model
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['mean_absolute_error'])
        return model

    # ------------------------------------------------------------------
    # recurrent model builders
    # ------------------------------------------------------------------
    def RNN(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(10, input_dim=idim))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(14, input_dim=idim, activation='relu'))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def multi_RNN2(self, idim = 20):
        model = Sequential()
        model.add(SimpleRNN(40, input_dim=idim))
        for width in (30, 20, 10):
            model.add(Dense(width, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(optimizer='Adam', loss='mean_absolute_error')
        return model

    def lstm(self, idim = 20):
        model = Sequential()
        model.add(LSTM(20, input_dim=idim))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    def multi_lstm(self, idim = 20):
        model = Sequential()
        model.add(LSTM(14, input_dim=idim, activation='relu'))
        model.add(Dense(7, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mean_absolute_error', optimizer='adam')
        return model

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def returnModel(self):
        """Return the wrapped model object."""
        return self.model

    def train(self, X, y, bs=10, epochs=100):
        """Train the wrapped model on ``X``/``y``.

        The GAM is tuned with ``gridsearch``; Keras models are fitted with
        the given batch size and epoch count.
        """
        if self.type == 3:
            self.model.gridsearch(X, y)
            return
        if self.type == 1:
            X = self._as_sequences(X)
        self.model.fit(X, y, batch_size = bs, epochs = epochs, shuffle=True,
                       verbose = 0)

    def prediction(self, X):
        """Return the wrapped model's predictions for ``X``."""
        if self.type == 1:
            X = self._as_sequences(X)
        return self.model.predict(X)

    def cross_eval_with_plotting(self, city, X, y, bs=10, ep=100, k=3):
        """Run unshuffled k-fold evaluation and plot true vs. predicted values.

        Args:
            city: Label inserted into the plot title.
            X, y: Arrays indexable with index arrays.
            bs, ep: Batch size and epochs (ignored for types 2 and 3).
            k: Number of folds.

        Returns:
            Mean fold score, or ``None`` when ``self.type`` is not 0-3.
        """
        # A single figure receives all folds.
        plt.subplots()
        if self.type not in (0, 1, 2, 3):
            return None

        scores = []
        offset = 0
        for train_index, test_index in self._kfold(k, shuffle=False).split(X):
            y_test = y[test_index]
            score, predictions = self._run_fold(
                X[train_index], y[train_index], X[test_index], y_test,
                bs, ep, want_predictions=True)
            # Folds are drawn one after another along the x axis; a running
            # offset keeps them contiguous when fold sizes differ.
            span = range(offset, offset + len(y_test))
            plt.plot(span, y_test, 'm', alpha=0.4)
            plt.plot(span, predictions, 'g')
            offset += len(y_test)
            scores.append(score)

        if self.type == 0:
            plt.title('True vs. Predicted Cases {}'.format(city))
        else:
            plt.title('True vs. Predicted Cases in {}'.format(city))
        plt.xlabel('Week')
        plt.ylabel('Cases of Dengue')
        plt.legend(['True', 'Predicted'])
        plt.show()
        return sum(scores) / len(scores)

    def cross_eval(self, X, y, bs=10, ep=100, k=3):
        """Run k-fold evaluation and return the mean fold score.

        Folds are shuffled (seed 0) for types 0 and 2 and taken in order for
        types 1 and 3. For the GAM the fold predictions are printed.

        Returns:
            Mean fold score, or ``None`` when ``self.type`` is not 0-3.
        """
        if self.type not in (0, 1, 2, 3):
            return None

        splitter = self._kfold(k, shuffle=self.type in self._SHUFFLED_TYPES)
        scores = []
        for train_index, test_index in splitter.split(X):
            score, predictions = self._run_fold(
                X[train_index], y[train_index], X[test_index], y[test_index],
                bs, ep, want_predictions=False)
            if self.type == 3:
                print(predictions)
            scores.append(score)
        return sum(scores) / len(scores)
def EAM():
    """Fit a LinearGAM to the arrays stored in the EAM ``.npy`` files.

    Features are loaded from ``EAM_factors.npy`` and targets from
    ``EAM_time.npy`` (both relative to the current working directory).  The
    model has cubic spline terms on features 0 and 1 plus tensor-product
    terms for the feature pairs (0, 1) and (0, 2), and is fitted through
    ``gridsearch``.

    Returns:
        The ``LinearGAM`` after ``gridsearch``.  The function used to return
        ``None``, which made the fitted model unreachable for callers.
    """
    X = np.load('EAM_factors.npy')
    y = np.load('EAM_time.npy')
    gam = LinearGAM(
        s(0, spline_order=3) + s(1, spline_order=3) + te(0, 1) + te(0, 2))
    gam.gridsearch(X, y)
    return gam
df_system.head() # Get some data # In[4]: metric = 'jd' X = df_pheno.loc[:, 'ageAtScan1_Years'] Y = df_system.loc[:, metric] # Estimate GAM with spline # In[5]: gam = LinearGAM(s(0)).fit(X, Y) gam.gridsearch(X, Y) # Plot # In[6]: XX = gam.generate_X_grid(term=0) pdep, confi = gam.partial_dependence(term=0, X=XX, width=0.95) plt.figure() plt.plot(XX, pdep) # fit plt.plot(XX, confi, c='r', ls='--') # confidence interval plt.plot(XX, gam.prediction_intervals(XX, width=.95), color='b', ls='--') # 95% prediction interval plt.scatter(X, Y, facecolor='gray', edgecolors='none', alpha=0.5) # data plt.xlabel('Age')
def calculate_gene_trends(session_ID, list_of_genes, branch_ID):
    """Fit an expression-vs-pseudotime trend curve for every requested gene.

    The ``obs`` table of the session is read through ``cache_adata``.  One
    ``LinearGAM(n_splines=5, spline_order=3)`` per gene is fitted with
    ``gridsearch`` on the gene's "imputed" layer values against pseudotime,
    using at most 5000 randomly chosen cells (``np.random`` global state).

    Args:
        session_ID: Session identifier forwarded to the cache helpers.
        list_of_genes: Genes to fit; each contributes three output columns.
        branch_ID: ``-1`` fits on equally weighted cells over the whole
            pseudotime range.  Any other value weights the cells by the
            ``obs`` column ``"pseudotime_branch_<branch_ID>"`` and limits
            the evaluated pseudotime range to cells whose value in that
            column exceeds 0.2.

    Returns:
        A DataFrame with 125 rows: ``"pseudotime"`` (evenly spaced grid) and,
        per gene, ``<gene>``, ``<gene>_ci_upper`` and ``<gene>_ci_lower``
        (95% confidence interval).  All values are clipped below at 0.

    Side effects:
        Reports progress through ``cache_progress`` (two setup steps plus one
        step per gene) and prints debug / timing lines.
    """
    total_steps = len(list_of_genes) + 2

    def report(step):
        # Publish the completed share of work as an integer percentage.
        cache_progress(session_ID, progress=int(step / total_steps * 100))

    obs = cache_adata(session_ID, group="obs")
    report(1)

    # ``None`` stands for "no branch selected" from here on.
    branch_probs = (None if branch_ID == -1
                    else obs["pseudotime_branch_" + str(branch_ID)])
    pseudotime = obs["pseudotime"]
    report(2)

    if branch_probs is None:
        cells_in_branch = obs.index
    else:
        cells_in_branch = obs[branch_probs > 0.2].index
    print(f"[DEBUG] branch: {branch_ID!s}")

    # (Reference implementation this replaces:
    #  palantir.presults.compute_gene_trends on the imputed expression.)

    all_pseudotime = pseudotime.to_numpy()

    # reduce the number of data points we fit to save computation time
    max_samples_to_fit = 5000
    if len(all_pseudotime) > max_samples_to_fit:
        keep = np.zeros_like(all_pseudotime)
        keep[:max_samples_to_fit] = 1
        np.random.shuffle(keep)
    else:
        keep = np.ones_like(all_pseudotime)
    keep = keep.astype(bool)

    if branch_probs is None:
        weights = np.ones_like(all_pseudotime)
    else:
        weights = branch_probs.to_numpy()

    # Column vectors restricted to the sub-sampled cells.
    X_train = all_pseudotime[keep].reshape(-1, 1)
    weights = weights[keep].reshape(-1, 1)

    branch_pseudotime = obs["pseudotime"][cells_in_branch]
    X_plot = np.linspace(np.min(branch_pseudotime),
                         np.max(branch_pseudotime), 125)

    # Collected in insertion order; turned into the result frame at the end.
    columns = {"pseudotime": X_plot}

    for step, gene in enumerate(list_of_genes, start=3):
        started = datetime.now()
        expression = get_obs_vector(session_ID, gene, layer="imputed")
        print(f"[BENCH] time for get_obs_vector: {datetime.now() - started!s}")
        Y_train = expression[keep]

        gam = LinearGAM(n_splines=5, spline_order=3)
        started = datetime.now()
        gam.gridsearch(X_train, Y_train, weights=weights, progress=False)
        print(f"[BENCH] time for gam fit: {datetime.now() - started!s}")

        # An ExpectileGAM-based band (expectiles 0.25 / 0.75) was tried as an
        # alternative; the GAM's own confidence interval is used instead.
        columns[gene] = gam.predict(X_plot)
        interval = gam.confidence_intervals(X_plot, width=.95)
        columns[gene + "_ci_upper"] = interval[:, 1]
        columns[gene + "_ci_lower"] = interval[:, 0]

        report(step)

    return pd.DataFrame(columns).clip(lower=0)