def run_gam_effective_r_from_empirical(state_data, n_splines=25, algo=GammaGAM, n_bootstrap=100):
    # for numerical stability
    epsilon = 1
    R_series = (state_data['confirmed_new'] /
                state_data['confirmed_total'].shift(1)).dropna() * 1 / RECOVERY_RATE
    X = np.arange(R_series.shape[0])
    y = R_series.values + epsilon

    # running GAM in bootstrap
    bootstrap = []
    for _ in range(n_bootstrap):
        weights = dirichlet([1] * R_series.shape[0]).rvs(1)
        gam = algo(s(0, n_splines) + l(0))
        gam.fit(X, y, weights=weights[0])
        bootstrap.append(gam)

    preds = pd.DataFrame([m.predict(X) - epsilon for m in bootstrap]).T

    estimate_rt = pd.DataFrame(index=R_series.index)
    estimate_rt['ML'] = preds.mean(axis=1).values
    estimate_rt['Low_90'] = preds.quantile(0.05, axis=1).values
    estimate_rt['High_90'] = preds.quantile(0.95, axis=1).values
    return estimate_rt.dropna()
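Not part of the original source: a minimal usage sketch for the helper above, assuming its module imports numpy, pandas, scipy.stats.dirichlet and pygam's GammaGAM/s/l, and defines a module-level RECOVERY_RATE constant. The synthetic case counts and the 10-day recovery assumption are purely illustrative.

# Illustrative only: fabricate a small epidemic curve and estimate R_t with the
# bootstrap-GAM helper defined above. RECOVERY_RATE is an assumed constant.
import numpy as np
import pandas as pd
from scipy.stats import dirichlet           # used inside the helper
from pygam import GammaGAM, s, l            # used inside the helper

RECOVERY_RATE = 1 / 10   # assumed: 10-day mean infectious period

dates = pd.date_range("2020-03-01", periods=60, freq="D")
state_data = pd.DataFrame({"confirmed_new": np.random.poisson(50, size=60)}, index=dates)
state_data["confirmed_total"] = state_data["confirmed_new"].cumsum()

rt = run_gam_effective_r_from_empirical(state_data, n_bootstrap=20)
print(rt[["Low_90", "ML", "High_90"]].tail())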
def fit_gam_with_fix_dof(X, Y, dof): ##{{{
    lam_up = 1e2
    lam_lo = 1e-2
    tol = 1e-2
    diff = 1. + tol
    n_splines = int(dof + 2)
    nit = 0
    while diff > tol:
        lam = (lam_up + lam_lo) / 2.
        gam_model = pg.LinearGAM(pg.s(0, n_splines=n_splines, penalties="auto", lam=lam) +
                                 pg.l(1, penalties=None))
        gam_model.fit(X, Y)
        current_dof = gam_model.statistics_["edof"]
        if current_dof < dof:
            lam_up = lam
        else:
            lam_lo = lam
        diff = np.abs(dof - current_dof)
        nit += 1
        if nit % 100 == 0:
            lam_up = 1e2
            lam_lo = 1e-2
            n_splines += 1
    return gam_model
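A hedged usage sketch for the bisection helper above, assuming `pg` is pygam and `np` is NumPy in its module; the two-column synthetic design matrix and the target of 7 effective degrees of freedom are illustrative only.

# Illustrative only: two-column design matrix (smoothed feature + linear feature),
# with the smoothing parameter bisected until edof is within tol of the target.
import numpy as np
import pygam as pg

rng = np.random.default_rng(0)
t = np.linspace(0, 10, 200)
z = rng.normal(size=200)
X = np.stack((t, z), axis=-1)
Y = np.sin(t) + 0.3 * z + rng.normal(scale=0.2, size=200)

model = fit_gam_with_fix_dof(X, Y, dof=7)
print(model.statistics_["edof"])  # expected to land near the requested dof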
def AAM():
    gam = LinearGAM(s(0, n_splines=25, spline_order=3, constraints='concave',
                      penalties='auto', basis='cp', edge_knots=[147, 147])
                    + l(3)       # the last travel time
                    + te(0, 1)   # distance and departure_time
                    + te(2, 0)   # distance and isWeekend
                    + l(2),      # isWeekend
                    fit_intercept=True)
    print(gam.gridsearch(X1, y1).summary())
    # print(gam.gridsearch(X1, y1).get_params(deep=True))

    '''plt.scatter(X1[:, 0][0:56], y1[0:56], s=3, linewidth=1, label='data')
    plt.plot(X1[:, 0][0:56], gam.predict(X1[0:56]), color='red', linewidth=1, label='prediction')
    plt.legend()
    plt.title('Extended Additive Model')
    plt.show()'''

    # error calculation
    rmse_val = rmse(np.array(y1), np.array(gam.predict(X1)))
    print("RMSE is: " + str(rmse_val))
    mae = mean_absolute_error(y1, gam.predict(X1))
    print("MAE is: " + str(mae))
    mape = mean_absolute_percentage_error(np.array(y1), np.array(gam.predict(X1)))
    print("MAPE is: " + str(mape))
def fit_gam(series, n_splines=25, algo=PoissonGAM, n_bootstrap=100):
    X = np.arange(series.shape[0])
    y = series.values

    # running GAM in bootstrap
    bootstrap = []
    for _ in range(n_bootstrap):
        weights = dirichlet([1] * series.shape[0]).rvs(1)
        gam = algo(s(0, n_splines) + l(0))
        gam.fit(X, y, weights=weights[0])
        bootstrap.append(gam)
    return bootstrap
def estimate_gam(series, n_splines=25, algo=PoissonGAM, n_bootstrap=100):
    X = np.arange(series.shape[0])
    y = series.values

    # running GAM in bootstrap
    bootstrap = []
    for _ in range(n_bootstrap):
        weights = dirichlet([1] * series.shape[0]).rvs(1)
        gam = algo(s(0, n_splines) + l(0))
        gam.fit(X, y, weights=weights[0])
        bootstrap.append(gam)

    preds = pd.DataFrame([m.predict(X) for m in bootstrap]).T
    return preds
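A short, assumed usage sketch for `estimate_gam` (and, by extension, `fit_gam`): it reduces the bootstrap prediction matrix to a point estimate and a 90% band, mirroring the reduction in the first snippet. The synthetic Poisson series and column names are illustrative, not from the original source.

# Illustrative only: summarise the bootstrap prediction matrix into a point
# estimate and a 90% band (the same reduction used in the R_t snippet above).
import numpy as np
import pandas as pd
from scipy.stats import dirichlet    # used inside estimate_gam
from pygam import PoissonGAM, s, l   # used inside estimate_gam

series = pd.Series(np.random.poisson(30, size=90),
                   index=pd.date_range("2021-01-01", periods=90, freq="D"))
preds = estimate_gam(series, n_bootstrap=20)

summary = pd.DataFrame(index=series.index)
summary["ML"] = preds.mean(axis=1).values
summary["Low_90"] = preds.quantile(0.05, axis=1).values
summary["High_90"] = preds.quantile(0.95, axis=1).values
print(summary.tail())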
def lingam(term='spline'):
    """
    Method to load an unfitted Generalized Additive Model of type modelclass

    INPUT:
        term: 'linear', 'spline' or 'factor'

    RETURN:
        model
    """
    # GAM with linear term
    if term == 'linear':
        regmod = LinearGAM(l(0))
    # GAM with spline term
    elif term == 'spline':
        regmod = LinearGAM(s(0))
    # GAM with factor term
    elif term == 'factor':
        regmod = LinearGAM(f(0))
    else:
        raise ValueError('Given GAM term unknown')

    utils.display_get_params('LinearGAM Model Description', regmod.get_params())
    return regmod
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Perform a grid search, train a Linear GAM model, and return predictions for the test set.

    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest: X values for validation.
    :return: Predictions from the Linear GAM model for the test dataset.
    """
    # Create an array of lambda values to search
    lams = np.logspace(-3, 20, 35)

    # GAM search requires numpy arrays
    Xtrain_np = np.array(Xtrain, dtype=np.float64)
    Ytrain_np = np.array(Ytrain, dtype=np.float64)

    # Linear Generalised Additive Model
    model = LinearGAM(
        s(99) + s(100) + l(3) + l(6) + l(8) + l(11) + l(7) + l(9) + l(12) +
        l(10) + l(14) + l(29) + l(15) + l(71) + l(17) + l(21) + l(107) +
        l(16) + l(68) + l(78) + l(61) + l(55) + l(31) + l(13) + l(37) +
        l(4) + l(5) + l(2) + te(4, 5) + te(68, 78)).gridsearch(
            Xtrain_np, Ytrain_np, lam=lams)
    return model.predict(Xtest)
def GAM1(self):
    """Generalized Additive Model with possible non-linear effects. Specific
    variables are modelled by splines. Can the possible non-linearities be
    captured by adding polynomial terms to the linear model? Fit such a model
    and comment on the two solutions."""
    from pygam import LinearGAM, s, l, f

    """Non-linear effects are modeled by splines. Analyze the summary table
    and declare which factors should be splined. Do this depending on the
    so-called significance code of the table."""
    terms = l(0)+l(1)+l(2)+l(3)+l(4)+l(5)+l(6)+l(7)+l(8)+l(9)+l(10)+l(11)\
            +l(12)+l(13)+l(14)+l(15)+l(16)+l(17)+l(18)+l(19)+l(20)+l(21)+l(22)\
            +l(23)
    gam = LinearGAM(terms=terms, fit_intercept=False)
    mod = gam.gridsearch(self.Xtrain.values, self.ytrain.values,
                         lam=np.logspace(-3, 3, 11))  # Generate the model
    mod.summary()  # Pseudo-R2: 0.6449
    ypred = mod.predict(self.Xtest)
    MSE1 = np.mean((self.ytest - ypred.reshape(-1, 1))**2).values

    if self.plot:
        plt.plot(ypred.reshape(-1, 1), label='GAM model')
        plt.plot(self.ytest, label='Testing Data')
        plt.legend()
        plt.title("GAM model with linear terms")
        plt.ylabel("FFVC score")
        plt.xlabel("Sample no.")
        plt.show()

    """Repeat the study adding the 'auto' function, adding splines and
    polynomial contributions."""
    gam = LinearGAM(terms='auto', fit_intercept=False)
    mod = gam.gridsearch(self.Xtrain.values, self.ytrain.values,
                         lam=np.logspace(-3, 3, 11))  # Generate the model
    mod.summary()  # Pseudo-R2: 0.6449
    ypred = mod.predict(self.Xtest)
    MSE2 = np.mean((self.ytest - ypred.reshape(-1, 1))**2).values

    if self.plot:
        plt.plot(ypred.reshape(-1, 1), label='GAM model')
        plt.plot(self.ytest, label='Testing Data')
        plt.legend()
        plt.title("GAM model with spline terms")
        plt.ylabel("FFVC score")
        plt.xlabel("Sample no.")
        plt.show()

    print(f"Linear GAM produced MSE={MSE1}," + "\n"
          f"Spline addition produced MSE={MSE2}")

    """Save these values for Exercise 7."""
    self.GAM1E1P5 = MSE1[0]
    self.GAM2E1P5 = MSE2[0]
    return 1
plt.xticks([0, 0.5, 1])
plt.xlabel('fraction of random sets with\nvariance<variance(barcode control set)')

f.savefig('./figures/Fig6/Fig6D_five_bccontrols_vs_randomdistribution_overview.png',
          dpi=300, format='png', bbox_inches='tight', frameon=True)

#%%
################
# Overview plots, relationship mean splicing values - noise
################

meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1) & (irdf.number_reads > 100) &
                     (irdf.fraction_canonical > 0.3)][['wav_stats', 'rnaperdna',
                                                       'noisestrengthlogwstd']].dropna()

gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10) + pygam.l(1, lam=1)).fit(
    meanplusnoise[['wav_stats', 'rnaperdna']], meanplusnoise.noisestrengthlogwstd)
pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])
meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = meanplusnoise['noisestrengthlogwstd'] - meanplusnoise['noisegampred']

f = plt.figure(figsize=(4, 4))
plt.scatter(meanplusnoise.wav_stats,
            meanplusnoise.noisestrengthlogwstd,
            s=10, alpha=0.2, color=sns.xkcd_rgb['medium blue'])
plt.plot(meanplusnoise.wav_stats, meanplusnoise.noisegampred, '.',
         color=sns.xkcd_rgb['light green'], alpha=0.2, markersize=5)
plt.xlabel('splicing value')
plt.ylabel('splicing noise strength [log2]')
plt.ylim(-9, 3)
plt.xlim(0, 7.2)
def gam_decomposition_old_old_old(Xd, Enat, Sigma=None, time_center=None, gam_dof=7, verbose=False): ##{{{
    """
    NSSEA.gam_decomposition
    =======================
    Perform the decomposition anthropic/natural forcing with GAM

    arguments
    ---------
    """
    models = Xd.columns.to_list()
    n_models = Xd.shape[1]
    n_sample = Enat.shape[1] - 1
    time = Xd.index.values
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Eant = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):
        gam_model = pg.LinearGAM(pg.s(0, n_splines=gam_dof - 2, penalties=None) +
                                 pg.l(1, penalties=None))
        gam_model.fit(np.stack((time, Enat.values[:, 0]), -1), Xd.values[:, i])
        X.values[:, 0, 0, i] = gam_model.predict(np.stack((time, Enat.values[:, 0]), -1))
        X.values[:, 0, 1, i] = gam_model.predict(np.stack((time_l, Enat.values[:, 0]), -1))
        X.values[:, 0, 2, i] = gam_model.predict(np.stack((time, Eant), -1))

        for j in range(n_sample):
            if verbose:
                pb.print()

            Xl = Enat.values[:, j + 1]
            mVt = np.stack((time, Xl), -1)
            mVl = np.stack((time_l, Xl), -1)

            ## GAM decomposition
            gam_model = pg.LinearGAM(pg.s(0, n_splines=gam_dof - 2, penalties=None) +
                                     pg.l(1, penalties=None))
            gam_model.fit(mVt, Xd.values[:, i])

            ## Coefficients of decomposition
            int_coef = gam_model.coef_[-1]
            lin_coef = gam_model.coef_[-2]
            spl_coef = gam_model.coef_[:-2]
            spl_mat = gam_model._modelmat(mVt).todense()[:, :-2]
            proj_mat = spl_mat @ np.linalg.inv(spl_mat.T @ spl_mat) @ spl_mat.T

            ## Noise of linear term
            sigma_lin = np.sqrt((Xl.transpose() @ Sigma @ Xl) / (Xl.transpose() @ Xl)**2)
            noise_lin = np.random.normal(loc=0, scale=sigma_lin)

            ## Noise of spline term
            std_spl = matrix_squareroot(matrix_positive_part(proj_mat.transpose() @ Sigma @ proj_mat))
            noise_spl = np.ravel(std_spl @ np.random.normal(loc=0, scale=1, size=time.size).reshape((n_time, 1)))
            noise_spl = noise_spl - noise_spl[0]

            ## Final decomposition
            gam_model.coef_[-2] += noise_lin
            X.values[:, j + 1, 0, i] = gam_model.predict(mVt) + noise_spl
            X.values[:, j + 1, 1, i] = gam_model.predict(mVl)
            X.values[:, j + 1, 2, i] = gam_model.predict(np.stack((time, Eant), -1)) + noise_spl

    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose:
        pb.end()

    return XSplitted(X, X_event, X_center)
def gam_decomposition_classic(lX, Enat, Sigma=None, time_center=None, n_splines=None, gam_lam=None, verbose=False): ##{{{
    """
    NSSEA.gam_decomposition
    =======================
    Perform the decomposition anthropic/natural forcing with GAM

    arguments
    ---------
    """
    models = [lx.columns[0] for lx in lX]
    n_models = len(models)
    n_sample = Enat.shape[1] - 1
    time = np.unique(lX[0].index)
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Xa = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    spl_pen = "auto"
    lin_pen = None
    if n_splines is None:
        n_splines = 8
    if gam_lam is None:
        gam_lam = 0.6

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):
        Xl = Enat.values[:, 0]
        x_all = np.stack((time, Xl), -1)
        x_nat = np.stack((time_l, Xl), -1)

        ## GAM decomposition
        gam_model = pg.LinearGAM(pg.s(0, n_splines=n_splines, penalties=spl_pen, lam=gam_lam) +
                                 pg.l(1, penalties=lin_pen))
        # gam_model = pg.LinearGAM( pg.s( 0 , n_splines = gam_dof - 2 , penalties = spl_pen , lam = 0.9 ) + pg.l( 1 , penalties = lin_pen ) )
        # gam_model = pg.LinearGAM( pg.s( 0 , n_splines = gam_dof - 2 , penalties = spl_pen ) + pg.l( 1 , penalties = lin_pen ) )
        gam_model.fit(np.stack((lX[i].index, Enat.loc[lX[i].index, 0].values), -1), lX[i].values)
        X.values[:, 0, 0, i] = gam_model.predict(x_all)
        X.values[:, 0, 1, i] = gam_model.predict(x_nat)

        mean_coef = gam_model.coef_
        cov_coef = gam_model.statistics_["cov"]

        for j in range(n_sample):
            if verbose:
                pb.print()

            Xl = Enat.values[:, j + 1]
            x_all = np.stack((time, Xl), -1)
            x_nat = np.stack((time_l, Xl), -1)

            ## Perturbation
            gam_model.coef_ = np.random.multivariate_normal(mean=mean_coef, cov=cov_coef, size=1).ravel()

            ## Final decomposition
            X.values[:, j + 1, 0, i] = gam_model.predict(x_all)
            X.values[:, j + 1, 1, i] = gam_model.predict(x_nat)

    X.loc[:, :, "ant", :] = X.loc[:, :, "all", :] - X.loc[:, :, "nat", :]

    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose:
        pb.end()

    return XSplitted(X, X_event, X_center)
irdf['noiseres_linear'] = irdf.index.map(
    lambda x: irdf.noisestrengthlogwstd[x] - slope * irdf.wav_stats[x] + const
    if (irdf.wav_stats[x] > 0) & (irdf.wav_stats[x] < 8) &
       (irdf.number_reads[x] > 100) & (irdf.smoothednumberofpeaks[x] == 1) &
       (irdf.fraction_canonical[x] > 0.3)
    else np.nan)

#########
# irdfold = pd.read_pickle(martin + 'combined_analysis/dataframes/irdf_corrected_fromunbiasedmappingwithoutumis_July2018.pkl')

#### calculate noise residuals using a generalized additive model

meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1) & (irdf.number_reads > 100) &
                     (irdf.fraction_canonical > 0.3)][['wav_stats', 'rnaperdna',
                                                       'noisestrengthlogwstd']].dropna()

randomgaml = pygam.LinearGAM(pygam.s(0) + pygam.l(1)).gridsearch(
    meanplusnoise[['wav_stats', 'rnaperdna']].values,
    meanplusnoise.noisestrengthlogwstd.values,
    lam=[0.01, 0.1, 1, 5, 10])

gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10) + pygam.l(1, lam=1)).fit(
    meanplusnoise[['wav_stats', 'rnaperdna']], meanplusnoise.noisestrengthlogwstd)
pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])
meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = meanplusnoise['noisestrengthlogwstd'] - meanplusnoise['noisegampred']
               return_type='dataframe')
sp8t = dmatrix("bs(xtest['Enroll'], df=6, include_intercept=False)",
               {"xtest['Enroll']": xtest['Enroll']}, return_type='dataframe')
x2test = pd.concat([
    xtest['Private01'], xtest['Room.Board'], xtest['perc.alumni'],
    xtest['Grad.Rate'], sp3t, sp4t, sp6t, sp7t, sp8t
], axis=1)
ypred = fit2.predict(sm.add_constant(x2test))
print('GAM MSE: %.2f' % mean_squared_error(ytest, ypred))  # 3876188.82

# (d)
gam = LinearGAM(
    l(0) + l(1) + s(2, n_splines=6) + f(3) + s(4, n_splines=6) + l(5) +
    s(6, n_splines=6) + s(7, n_splines=6) + s(8, n_splines=6)).fit(x1, ytrain)

for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue
    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)
    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(repr(term))
    plt.show()

# Non-linear: Expand, Personal, Accept.
all_permut_preds_alphas = np.empty([1000, 1])
# feature weights from the real model will be stored in this array
featureWeights_AI = np.empty([100, data_AI.shape[1] - 3])
# adding MSE quantification
mse_AI = np.empty([100, 1])

# run real predictions 100 times. Allows for each subject to be randomly
# allocated to the testing third multiple times.
for split in range(0, 100):
    # for a few different train and test splits
    # Train and test split from data frame
    xtrain_AI, xtest_AI, ytrain_AI, ytest_AI, indices_train_AI, indices_test_AI = train_test_split(
        Featvecs_AI, varofintAI, indices, test_size=0.33, random_state=(split))

    # make dataframe of non-brain variables to regress covariates from EF in training
    df = np.array([age[indices_train_AI], mot[indices_train_AI], varofintAI[indices_train_AI]])
    # transpose so subjects are rows
    dft = np.transpose(df)

    # regress covariates from EF in training sample (Linear GAM still has spline term)
    GAMFit = LinearGAM(s(0, n_splines=5) + l(1)).fit(dft, dft[:, 2])
    # get residuals
    residsvec = GAMFit.deviance_residuals(dft[:, [0, 1, 2]], dft[:, 2])
    # set ytrain to residuals
    ytrain_AI = residsvec

    # make equivalent dataframe for testing sample, but fit age and motion effects from training model
    df2 = np.array([age[indices_test_AI], mot[indices_test_AI], varofintAI[indices_test_AI]])
    df2t = np.transpose(df2)
    # apply model to unseen data to get those residuals for testing set
    testResidsvec = GAMFit.deviance_residuals(df2t[:, [0, 1, 2]], df2t[:, 2])
    # replace y test with age/motion controlled EF
    ytest_AI = testResidsvec

    # fit model with gcv
    lm_AI = sklearn.linear_model.RidgeCV(alphas=alphas, store_cv_values=True).fit(xtrain_AI, ytrain_AI)
    # set prediction alpha to best performing alpha in training set
    alpha_AI = lm_AI.alpha_
def fit(
    self,
    X: pd.DataFrame,
    Y: pd.DataFrame,
):
    """
    Fit a Poisson regression model for the cases using active_cases and
    percentage_susceptible at time t-1, and another model for removed using
    active_cases at time t-1.

    Args:
        X (pd.DataFrame): Dataframe for given region of predictor variables
            containing columns date, province, active_cases,
            percent_susceptible, and all columns for provinces for
            {province_name}_active_cases_yesterday,
            {province_name}_percent_susceptible_yesterday, as well as all
            log features
        Y (pd.DataFrame): Dataframe for given region of response variables
            containing columns date, province, cases, removed
    """
    self.X_original = X.copy()
    self.Y_original = Y.copy()
    self.provinces = X["province"].unique()

    # Fit model for each province
    self.X_cases = {}
    self.Y_cases = {}
    self.X_removed = {}
    self.Y_removed = {}
    self.poisson_gam_cases = {}
    self.poisson_gam_removed = {}

    for province in self.provinces:
        # Remove extra columns for given province in form {province}_column_name
        cols_drop = X.filter(regex=province, axis=1).columns
        X_province = X.query(f"province == '{province}'").drop(cols_drop, axis=1)
        Y_province = Y.query(f"province == '{province}'")

        # Store case dataframe used to train model for each province
        self.X_cases[province] = X_province.filter(
            regex=r"(log_active_cases_yesterday|log_percent_susceptible_yesterday)")
        self.Y_cases[province] = Y_province["cases"]

        # Add terms for each province I_t-1 and Z_t-1. Either splines or linear terms
        if self.use_splines:
            terms = s(0, lam=self.lam_main) + s(1, lam=self.lam_main)
            for i in range(1, len(self.provinces)):
                terms += s(i * 2, lam=self.lam_other) + s(i * 2 + 1, lam=self.lam_other)
        else:
            terms = l(0, lam=self.lam_main) + l(1, lam=self.lam_other)
            for i in range(1, len(self.provinces)):
                terms += l(i * 2, lam=self.lam_other) + l(i * 2 + 1, lam=self.lam_other)

        # Fit cases model for province
        cases_model = PoissonGAM(terms, verbose=self.verbose)
        cases_model.fit(self.X_cases[province], self.Y_cases[province])
        self.poisson_gam_cases[province] = cases_model

        # Store removed dataframe used to train model for each province
        self.X_removed[province] = X_province.filter(regex=r"log_active_cases_yesterday")
        self.Y_removed[province] = Y_province["removed"]

        # Add terms for each province I_t-1
        terms = l(0, lam=self.lam_main)
        for i in range(1, len(self.provinces)):
            terms += l(i, lam=self.lam_other)

        # Fit removed model for each province against the removed counts
        removed_model = PoissonGAM(terms, verbose=self.verbose)
        removed_model.fit(self.X_removed[province], self.Y_removed[province])
        self.poisson_gam_removed[province] = removed_model

    return
fig, ax = plt.subplots(figsize=(5, 5))
partialResidualPlot(result_spline, house_98105, 'AdjSalePrice', 'SqFtTotLiving', ax)
plt.tight_layout()
plt.show()

### Generalized Additive Models

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model
gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
print(gam.summary())

fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)
titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=.95)[1],
            c='r', ls='--')
    ax.set_title(titles[i])
# prep X and y
features = ['year', 'age', 'education']
X = df[features].values
crude_strCat_to_int(X, 2)
y = df['wage'].values

# test different types of term on categorical feature
# term types: spline (default), linear effect, factor, spline with categorical dtype
from pygam import LinearGAM, s, f, l

gam1 = LinearGAM(s(0) + s(1) + s(2)).fit(X, y)
gam2 = LinearGAM(s(0) + s(1) + l(2)).fit(X, y)
gam3 = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam4 = LinearGAM(s(0) + s(1) + s(2, dtype='categorical')).fit(X, y)

gams = [gam1, gam2, gam3, gam4]
terms_names = ['spline', 'linear', 'factor', 'categorical spline']

###########################################
# compare pdp
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (28, 28)

fig, axs = plt.subplots(4, X.shape[1], sharey='row')
for r, axr in enumerate(axs):