def _build_ensemble_feature(self, X, base_pred):
    """Build the feature array and the matching GAM ``TermList``.

    The values of ``base_pred`` are stacked so that each entry becomes one
    column of the feature array, and every such column gets one term:
    a spline ``s`` when ``self.nonlinear_ensemble`` is truthy, otherwise a
    linear term ``l``.

    When ``self.model_residual`` is truthy the columns of ``X`` are appended
    to the feature array. Each appended column gets its own spline term and,
    if ``X`` has more than one column, a single tensor-product term ``te``
    spanning all appended columns is added as well.

    Parameters
    ----------
    X : array with a ``shape`` attribute, indexed as (rows, columns)
        Extra features used for the residual process.
    base_pred : dict
        Mapping whose values are the base predictions.

    Returns
    -------
    tuple
        ``(ens_feature, gam_feature_terms)``: the feature array and the
        ``TermList`` describing one term per column (plus the optional
        tensor-product term).
    """
    ens_feature = np.asarray(list(base_pred.values())).T
    n_base = ens_feature.shape[1]

    if self.nonlinear_ensemble:
        make_term = s
    else:
        make_term = l

    term_list = []
    for column in range(n_base):
        term_list.append(make_term(column))

    # Optionally model the residual process on top of the ensemble terms.
    if self.model_residual:
        n_extra = X.shape[1]

        # One spline per appended column; indices continue after the
        # ensemble columns.
        for column in range(n_base, n_base + n_extra):
            term_list.append(s(column))

        # A tensor product only makes sense with at least two columns.
        if n_extra > 1:
            term_list.append(te(*list(n_base + np.array(range(n_extra)))))

        ens_feature = np.concatenate([ens_feature, X], axis=1)

    return ens_feature, TermList(*term_list)
def BAM(X, y):
    """Grid-search a two-feature pyGAM ``LinearGAM`` and return it.

    The model has a cubic spline (``spline_order=3``) on feature 0, a cubic
    spline on feature 1 and a tensor-product interaction between the two.

    Parameters
    ----------
    X : feature matrix passed straight to ``gridsearch``.
    y : target values passed straight to ``gridsearch``.

    Returns
    -------
    The ``LinearGAM`` instance after ``gridsearch(X, y)`` has been run on it.
    """
    terms = s(0, spline_order=3) + s(1, spline_order=3) + te(0, 1)
    model = LinearGAM(terms)
    model.gridsearch(X, y)
    return model
def interp_gam(data):
    """Predict the value at stream-distance ``(0, 0)`` with a fitted GAM.

    Rows whose first ``stream_dist`` coordinate is not finite are dropped.
    A ``LinearGAM`` (spline on each coordinate plus their tensor product) is
    grid-searched on the remaining rows and evaluated at the point
    ``[0, 0]``.

    Parameters
    ----------
    data : object exposing ``stream_dist.values`` (2-D, at least one column)
        and ``sample_z.values`` (one entry per row of ``stream_dist``).

    Returns
    -------
    The model prediction at ``[0, 0]``, or ``np.nan`` when no row has a
    finite first stream-distance coordinate.
    """
    stream_dist = data.stream_dist.values
    valid = np.isfinite(stream_dist[:, 0])

    # Bail out before doing any slicing or fitting: there is nothing to
    # fit a model on.
    if not valid.any():
        return np.nan

    sample_st = stream_dist[valid]
    sample_z = data.sample_z.values[valid]

    gam = LinearGAM(
        s(0, n_splines=4) + s(1, n_splines=5) + te(0, 1, n_splines=4)
    ).gridsearch(sample_st, sample_z)

    # predict() returns one value per query row; only one row is queried.
    return gam.predict(np.array([[0, 0]]))[0]
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Grid-search a Linear GAM on the training data and predict the test set.

    The model uses spline terms on columns 99 and 100, linear terms on a
    fixed list of columns, and tensor-product terms on the column pairs
    (4, 5) and (68, 78). The smoothing parameter is searched over 35
    log-spaced values between 1e-3 and 1e20.

    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest: X values for validation.
    :return: Predictions from Linear GAM model for test dataset
    """
    # Candidate lambda values for the grid search.
    lam_grid = np.logspace(-3, 20, 35)

    # The grid search is fed float64 numpy arrays.
    train_features = np.array(Xtrain, dtype=np.float64)
    train_targets = np.array(Ytrain, dtype=np.float64)

    # Columns that enter the model linearly, in model order.
    linear_columns = (
        3, 6, 8, 11, 7, 9, 12, 10, 14, 29, 15, 71, 17,
        21, 107, 16, 68, 78, 61, 55, 31, 13, 37, 4, 5, 2,
    )

    # Assemble the term list left to right: splines, linear terms, then the
    # two tensor-product interactions.
    terms = s(99) + s(100)
    for column in linear_columns:
        terms = terms + l(column)
    terms = terms + te(4, 5) + te(68, 78)

    fitted = LinearGAM(terms).gridsearch(
        train_features, train_targets, lam=lam_grid)
    return fitted.predict(Xtest)
def AAM():
    """Grid-search the extended additive model on ``X1``/``y1`` and report errors.

    Reads the module-level ``X1`` and ``y1``. Builds a ``LinearGAM`` with a
    concave cubic spline on feature 0, linear terms on features 3 and 2 and
    tensor-product terms on (0, 1) and (2, 0), runs ``gridsearch`` and prints
    its summary, then prints RMSE, MAE and MAPE of the in-sample predictions.

    Returns nothing; all results are printed.
    """
    gam = LinearGAM(
        s(0, n_splines=25, spline_order=3, constraints='concave',
          penalties='auto', basis='cp', edge_knots=[147, 147])
        + l(3)      # the last travel time
        + te(0, 1)  # distance and departure_time
        + te(2, 0)  # distance and isWeekend
        + l(2),     # isWeekend
        fit_intercept=True)
    print(gam.gridsearch(X1, y1).summary())
    # print(gam.gridsearch(X1,y1).get_params(deep=True))

    # Disabled diagnostic plot (was previously parked in a string literal):
    # plt.scatter(X1[:,0][0:56], y1[0:56], s=3, linewidth=1, label = 'data')
    # plt.plot(X1[:,0][0:56], gam.predict(X1[0:56]), color = 'red', linewidth = 1, label = 'prediction')
    # plt.legend()
    # plt.title('Extended Additive Model')
    # plt.show()

    # error calculation -- predict once and reuse for every metric
    predictions = gam.predict(X1)

    rmse_val = rmse(np.array(y1), np.array(predictions))
    print("RMSE is: "+str(rmse_val))

    mae = mean_absolute_error(y1, predictions)
    print("MAE is: "+str(mae))

    mape = mean_absolute_percentage_error(np.array(y1), np.array(predictions))
    print("MAPE is: "+ str(mape))
def BAM():
    """Grid-search the basic additive model on ``X``/``y``, plot it and report errors.

    Reads the module-level ``X`` and ``y``. Builds a ``GAM`` (normal
    distribution, identity link) with concave cubic splines on features 0
    and 1 plus their tensor product, runs ``gridsearch`` over ``n_splines``
    and prints its summary. It then plots the first 56 samples against the
    model prediction and prints RMSE, MAE and MAPE of the in-sample
    predictions.

    Returns nothing; all results are printed or plotted.
    """
    gam = GAM(
        s(0, n_splines=25, spline_order=3, constraints='concave',
          penalties='auto', basis='cp', edge_knots=[147, 147])
        + s(1, n_splines=25, spline_order=3, constraints='concave',
            penalties='auto', basis='cp', edge_knots=[147, 147])
        + te(0, 1, dtype=['numerical', 'numerical']),
        distribution='normal', link='identity', fit_intercept=True)

    # NOTE(review): np.arange(50) starts at 0; confirm that pyGAM accepts
    # n_splines candidates that small for spline_order=3.
    print(gam.gridsearch(X, y, n_splines=np.arange(50)).summary())

    # Data vs. prediction for the first 56 samples, along feature 0.
    plt.scatter(X[:, 0][0:56], y[0:56], s=3, linewidths=0.0001, label='data')
    plt.plot(X[:, 0][0:56], gam.predict(X[0:56]), color='red', linewidth=1,
             label='prediction')
    plt.legend()
    plt.title('Basic Additive Model')
    plt.show()

    # error calculation -- predict once and reuse for every metric
    predictions = gam.predict(X)

    rmse_val = rmse(np.array(y), np.array(predictions))
    print("RMSE is: " + str(rmse_val))

    mae = mean_absolute_error(y, predictions)
    print("MAE is: " + str(mae))

    mape = mean_absolute_percentage_error(np.array(y), np.array(predictions))
    print("MAPE is: " + str(mape))
def gamSplineSens(preinterpsurfaces):
    """Plot how the GAM fit error changes with the number of splines.

    For every spline count from 4 to 15, a tensor-product ``pygam.GAM`` over
    (lon, lat) is fitted to the ``"pres"`` field of each surface whose key,
    converted to ``int``, is greater than 2000 and which has more than 10
    non-NaN values. The log10 absolute errors of all those fits are pooled
    and drawn as one distribution per spline count; the figure is shown at
    the end.

    Parameters
    ----------
    preinterpsurfaces : dict
        Maps a key convertible with ``int()`` to a surface dict holding
        ``"lons"``, ``"lats"`` and ``"data"`` (itself a dict containing
        ``"pres"``).

    Returns nothing; the result is a matplotlib figure.
    """
    d = "pres"
    for splines in range(4, 16):
        error = []
        for k, surface in preinterpsurfaces.items():
            if int(k) <= 2000:
                continue

            values = np.asarray(surface["data"][d])
            notnan = ~np.isnan(values)
            # Too few valid samples to fit a surface on.
            if np.count_nonzero(notnan) <= 10:
                continue

            X = np.zeros((len(surface["lons"]), 2))
            X[:, 0] = surface["lons"]
            X[:, 1] = surface["lats"]

            gam = pygam.GAM(
                pygam.te(0, 1, n_splines=[splines, splines])
            ).fit(X[notnan], values[notnan])

            # The error is evaluated on every point, so NaN inputs yield
            # NaN errors here (unchanged from the original behaviour).
            error += list(np.log10(np.abs(values - gam.predict(X))))

        # NOTE(review): distplot is believed to be deprecated in recent
        # seaborn releases -- confirm against the pinned version.
        sns.distplot(error, kde_kws={"fill": False, "label": str(splines)})

    plt.legend()
    plt.show()
############################################################ # https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html #Fitting and plotting interactions with te() from pygam import PoissonGAM, s, te from pygam.datasets import chicago X, y = chicago(return_X_y=True) X.shape gam = PoissonGAM(s(0, n_splines=200) + te(3, 1) + s(2)).fit(X, y) import matplotlib.pyplot as plt from mpl_toolkits import mplot3d plt.ion() plt.rcParams['figure.figsize'] = (12, 8) XX = gam.generate_X_grid(term=1, meshgrid=True) Z = gam.partial_dependence(term=1, X=XX, meshgrid=True) ax = plt.axes(projection='3d') ax.plot_surface(XX[0], XX[1], Z, cmap='viridis') #Simple interactions, copare with te() from pygam import LinearGAM, s from pygam.datasets import toy_interaction X, y = toy_interaction(return_X_y=True)
def covariance(self, mean=None, smooth=None, **kwargs):
    """Compute an estimate of the covariance.

    The raw estimate is the empirical covariance of the centred
    observations. When ``smooth`` is given, the covariance surface is
    re-estimated either from locally smoothed curves (``'LocalLinear'``) or
    by fitting a tensor-product GAM to the off-diagonal entries of the raw
    covariance (``'GAM'``). The result is symmetrised in every case.

    As a side effect, the noise variance estimate is stored in
    ``self.var_noise``.

    Parameters
    ----------
    mean: DenseFunctionalData, default=None
        An estimate of the mean of self. If None, it is computed with
        ``self.mean(smooth)``.
    smooth: str, default=None
        Name of the smoothing method to use: ``'LocalLinear'``, ``'GAM'``
        or None for no smoothing.

    Returns
    -------
    obj: DenseFunctionalData object
        An estimate of the covariance as a two-dimensional
        DenseFunctionalData object with same argvals as `self`.

    Raises
    ------
    ValueError
        If the data have more than one input dimension.
    NotImplementedError
        If ``smooth`` is neither None, ``'LocalLinear'`` nor ``'GAM'``.

    Keyword Args
    ------------
    kernel_name: str, default='gaussian'
        Name of the kernel used for local polynomial smoothing of the
        covariance diagonal.
    degree: int, default=1
        Degree used for local polynomial smoothing.
    bandwidth: float, default=1
        Bandwidth used for local polynomial smoothing.
    n_basis: int, default=10
        Number of splines basis used for GAM smoothing.
    points: float, default=0.5
        Forwarded to ``self.smooth`` when ``smooth='LocalLinear'``.
    neighborhood: int, default=int(p * exp(-(log(log(p)))**2))
        Forwarded to ``self.smooth`` when ``smooth='LocalLinear'``; ``p``
        is the number of sampling points.

    References
    ----------
    * Yao, Müller and Wang (2005), Functional Data Analysis for Sparse
    Longitudinal Data,
    Journal of the American Statistical Association, Vol. 100, No. 470
    * Staniswalis, J. G., and Lee, J. J. (1998), “Nonparametric Regression
    Analysis of Longitudinal Data,” Journal of the American Statistical
    Association, 93, 1403–1418.

    """
    if self.n_dim > 1:
        raise ValueError('Only one dimensional functional data are'
                         ' supported')

    p = self.n_points['input_dim_0']
    argvals = self.argvals['input_dim_0']
    if mean is None:
        mean = self.mean(smooth)
    data = self.values - mean.values
    cov = np.dot(data.T, data) / (self.n_obs - 1)
    # Keep the raw diagonal: it is needed for the noise variance below even
    # after `cov` has been smoothed.
    cov_diag = np.copy(np.diag(cov))

    if smooth is not None:
        # Remove the covariance diagonal because of measurement errors.
        np.fill_diagonal(cov, np.nan)
        cov = cov[~np.isnan(cov)]

        # Every (s, t) pair of the grid; `train` drops the pairs on the
        # diagonal so that it lines up with the flattened `cov`.
        train_ = np.vstack((
            np.repeat(argvals, repeats=len(argvals)),
            np.tile(argvals, reps=len(argvals))))
        train = train_[:, train_[0, :] != train_[1, :]]

        if smooth == 'LocalLinear':
            points = kwargs.get('points', 0.5)
            # Builtin int: the `np.int` alias no longer exists in current
            # NumPy releases.
            neigh = kwargs.get(
                'neighborhood',
                int(p * np.exp(-(np.log(np.log(p)))**2)))
            data_smooth = self.smooth(points=points, neighborhood=neigh)
            data = data_smooth.values - mean.values
            cov = np.dot(data.T, data) / (self.n_obs - 1)
        elif smooth == 'GAM':
            n_basis = kwargs.get('n_basis', 10)

            # Fit on the off-diagonal pairs, predict on the full grid.
            cov = pygam.LinearGAM(pygam.te(0, 1, n_splines=n_basis)).\
                fit(np.transpose(train), cov).\
                predict(np.transpose(train_)).\
                reshape((len(argvals), len(argvals)))
        else:
            raise NotImplementedError('Smoothing method not implemented.')

    # Ensure the covariance is symmetric.
    cov = (cov + cov.T) / 2

    # Smoothing the diagonal of the covariance (Yao, Müller and Wang, 2005)
    lp = LocalPolynomial(kernel_name=kwargs.get('kernel_name', 'gaussian'),
                         bandwidth=kwargs.get('bandwidth', 1),
                         degree=kwargs.get('degree', 1))
    var_hat = lp.fit_predict(argvals, cov_diag, argvals)

    # Estimate noise variance (Staniswalis and Lee, 1998): integrate the
    # difference over the middle half of the domain.
    ll = argvals[len(argvals) - 1] - argvals[0]
    lower = np.sum(~(argvals >= (argvals[0] + 0.25 * ll)))
    upper = np.sum((argvals <= (argvals[len(argvals) - 1] - 0.25 * ll)))
    weights = integration_weights_(argvals[lower:upper], method='trapz')
    nume = np.dot(weights, (var_hat - cov_diag)[lower:upper])
    # Normalise by the length of the integration interval. The parentheses
    # matter: without them the division binds only to argvals[upper].
    self.var_noise = np.maximum(
        nume / (argvals[upper] - argvals[lower]), 0)

    new_argvals = {'input_dim_0': argvals, 'input_dim_1': argvals}
    return DenseFunctionalData(new_argvals, cov[np.newaxis])
# NOTE(review): this fragment starts inside a list literal whose opening
# (presumably `features = [`) lies outside the visible source.
    'pi', 'beta', 'rho', 'u', 'delta00', 'delta01', 'delta10', 'delta11',
    'mu1', 'mu2', 'mu3'
]
# TeX labels for the x axes, one per entry of the list above.
texnamesx = [
    '$\\pi$', '$\\beta$', '$\\rho$', '$u$', '$\\delta_{00}$',
    '$\\delta_{01}$', '$\\delta_{10}$', '$\\delta_{11}$', '$\\mu_1$',
    '$\\mu_2$', '$\\mu_3$'
]
# TeX labels for the y axes (partial-dependence functions).
# NOTE(review): '$f_10(...)$' and '$f_11(...)$' have an unbraced two-digit
# subscript; TeX would subscript only the first digit -- confirm whether
# '$f_{10}$' / '$f_{11}$' was intended.
texnamesy = [
    '$f_1(\\pi)$', '$f_2(\\beta)$', '$f_3(\\rho)$', '$f_4(u)$',
    '$f_5(\\delta_{00})$', '$f_6(\\delta_{01})$', '$f_7(\\delta_{10})$',
    '$f_8(\\delta_{11})$', '$f_9(\\mu_1)$', '$f_10(\\mu_2)$',
    '$f_11(\\mu_3)$'
]

# One Gamma GAM per response column of `res`; fit5, fit60 and fitSpread use
# the default term specification.
fit5 = GammaGAM().fit(res[features], res.sigma5)
# fit30 gets explicit terms: one spline per feature plus two tensor-product
# interactions, (9, 10) and (1, 3).
fit30 = GammaGAM(terms=s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) +
                 s(8) + s(9) + s(10) + te(9, 10) + te(1, 3)).fit(
                     res[features], res.sigma30)
# NOTE(review): gridsearch is run on the same data right after .fit();
# confirm whether the initial fit above is needed.
fit30 = fit30.gridsearch(res[features],
                         res.sigma30,
                         lam=np.logspace(3, 4, 100))
fit60 = GammaGAM().fit(res[features], res.sigma60)
fitSpread = GammaGAM().fit(res[features], res.meanSpread)

# %% Volatility 30 Plots
# One row per feature: 1000 evenly spaced points between that feature's
# minimum and maximum in `res`.
x_grid = np.array([
    np.linspace(min(res[feature]),
                max(res[feature]) * 1, 1000) for feature in features
])

# Partial dependence of term k with its 95% interval, evaluated on the grid
# (transposed so that features are columns).
# NOTE(review): the loop body may continue beyond the visible source.
for k in range(len(features)):
    pdeps, cof = fit30.partial_dependence(k, width=0.95, X=x_grid.T)
def EAM():
    """Grid-search a ``LinearGAM`` on the arrays stored in the EAM ``.npy`` files.

    Loads the features from ``EAM_factors.npy`` and the targets from
    ``EAM_time.npy`` (both relative to the current working directory), then
    runs ``gridsearch`` on a model with cubic splines on features 0 and 1
    and tensor-product terms on (0, 1) and (0, 2).

    Returns nothing; the fitted model is not handed back to the caller.
    """
    features = np.load('EAM_factors.npy')
    targets = np.load('EAM_time.npy')

    terms = (s(0, spline_order=3)
             + s(1, spline_order=3)
             + te(0, 1)
             + te(0, 2))
    model = LinearGAM(terms)
    model.gridsearch(features, targets)
# (38 previous) # 27,29121,0,22838721,264,11.9,0,69,9.6,4,0.0,60000,-11,1474355121,38,40 tester = [29121, 0, 264, 11.9, 0, 69, 9.6, 4] y_pred = gam.predict([tester]) print(y_pred) my_data = pd.read_csv("result_Charlemont Street.csv") attributes = ['time_of_day', 'type_of_day', 'day_of_year', 'temperature', 'rain', 'relative_humidity', 'vapour_pressure', 'wind_speed'] X = my_data[attributes].values # Xother = my_data[['time_of_day', 'type_of_day', 'day_of_year', 'temperature']].values y = my_data['available_bike_stands'].values gam = LinearGAM(te(0, 1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7), n_splines=[25, 10, 10, 10, 10, 10, 10, 10], dtype=['numerical', 'categorical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical', 'numerical']) gam.gridsearch(X, y) gam.summary() display_breakdown() test_model(gam)
#%%
# plotting
# 3-D scatter of the target against the two feature columns, thinned to
# every nr-th sample.
fig = plt.figure()
ax = plt.axes(projection='3d')
nr = 2
# The colour array uses the same stride as the points so both stay the same
# length when nr is changed.
ax.scatter3D(X[:, 1][::nr],
             X[:, 0][::nr],
             y[::nr],
             c=y[::nr],
             cmap='Spectral')
plt.show()

#%%
# pyGAM
from pygam import LinearGAM, s, te, PoissonGAM, f, GAM

# Monotonically increasing spline on column 0, free spline on column 1 and
# a tensor-product interaction between the two.
gam = GAM(
    s(0, constraints="monotonic_inc", n_splines=15)
    + s(1)  # , constraints="concave", n_splines=100)
    + te(1, 0))
gam.fit(X_train, y_train)

titles = ['QDot[l/min*m]', 'TemperaturStart']

fig, axs = plt.subplots(1, len(titles), figsize=(13, 9))
# plot partial dependences of the two spline terms (one axis per title)
for i, ax in enumerate(axs):
    print("i = ", i)
    XX = gam.generate_X_grid(term=i)
    # One call yields both the curve and its 95% interval.
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r')
    ax.set_title(titles[i])
    ax.grid()
def train(X, y):
    """Fit a two-feature ``LinearGAM`` and return it.

    The model has a spline on feature 0, a spline on feature 1 and a
    tensor-product interaction between them.

    Parameters
    ----------
    X : feature matrix passed straight to ``fit``.
    y : target values passed straight to ``fit``.

    Returns
    -------
    Whatever ``LinearGAM.fit`` returns (the fitted model).
    """
    terms = s(0) + s(1) + te(0, 1)
    model = LinearGAM(terms)
    return model.fit(X, y)
# AIC=2005 # gam=LinearGAM( s(0,n_splines=4) + s(1,n_splines=4) + te(0,1,n_splines=4) ).gridsearch(sample_st,sample_z) # AIC: 2760, but arguably the best looking. # gam=LinearGAM( s(0,n_splines=4) + s(1,n_splines=5) + te(0,1,n_splines=4) ).gridsearch(sample_st,sample_z) # AIC=1345 # gam=LinearGAM( s(0,n_splines=4) + s(1,n_splines=4) ).gridsearch(sample_st,sample_z) # AIC=1500 -- looks terrible # gam=LinearGAM( s(0,n_splines=4) ).gridsearch(sample_st,sample_z) # AIC=1250 -- meh. # gam=LinearGAM( s(1,n_splines=4) ).gridsearch(sample_st,sample_z) # AIC 2001 # gam=LinearGAM( te(0,1,n_splines=4) ).gridsearch(sample_st,sample_z) # AIC 2900 #gam=LinearGAM( te(0,1,n_splines=5) ).gridsearch(sample_st,sample_z) # 6900, but looks okay gam = LinearGAM(s(1, n_splines=5) + te(0, 1, n_splines=6)).gridsearch( sample_st, sample_z) print("AIC: ", gam.statistics_['AIC']) # gam.summary() #-- plt.figure(2).clf() nterms = len(gam.terms) - 1 # omit intercept fig, term_axs = plt.subplots(1, nterms, num=2) if nterms == 1: term_axs = [term_axs] titles = [repr(t) for t in gam.terms] for i, ax in enumerate(term_axs): XX = gam.generate_X_grid(term=i)