import pygam
import statsmodels.api as sm


def forward_selection(data_train, response, method, display=True):
    """Linear model designed by forward selection.

    Parameters
    ----------
    data_train : pandas DataFrame
        All possible predictors and the response.
    response : str
        Name of the response column in ``data_train``.
    method : str
        One of 'lm' (OLS), 'glm' (Gaussian GLM) or 'gam' (LinearGAM).
    display : bool, default=True
        If True, print the summary of the selected model.

    Returns
    -------
    model : an "optimal" fitted model with an intercept,
        selected by forward selection, evaluated by AIC.
    """
    remaining = set(data_train.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 1e99, 1e99
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            if method == 'lm':
                res_temp = sm.OLS(
                    data_train[response],
                    sm.add_constant(data_train[selected + [candidate]])).fit()
                score = res_temp.aic
                scores_with_candidates.append(
                    (score, candidate, res_temp.pvalues.iloc[-1]))
            elif method == 'glm':
                res_temp = sm.GLM(
                    data_train[response],
                    sm.add_constant(data_train[selected + [candidate]]),
                    family=sm.families.Gaussian()).fit()
                score = res_temp.aic
                scores_with_candidates.append(
                    (score, candidate, res_temp.pvalues.iloc[-1]))
            elif method == 'gam':
                res_temp = pygam.LinearGAM().fit(
                    data_train[selected + [candidate]],
                    data_train[response])
                score = res_temp.statistics_['AIC']
                scores_with_candidates.append(
                    (score, candidate, res_temp.statistics_['p_values'][-1]))
        # The candidate with the lowest AIC wins this round.
        best_new_score, best_candidate, p_value = min(scores_with_candidates)
        # Keep the candidate only if it improves the AIC and its term is
        # significant at the 5% level; otherwise current_score and
        # best_new_score differ and the while loop terminates.
        if (current_score > best_new_score) and (p_value <= 0.05):
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    # Refit the final model on the selected predictors only.
    if method == 'lm':
        res_temp = sm.OLS(data_train[response],
                          sm.add_constant(data_train[selected])).fit()
    elif method == 'glm':
        res_temp = sm.GLM(data_train[response],
                          sm.add_constant(data_train[selected]),
                          family=sm.families.Gaussian()).fit()
    elif method == 'gam':
        res_temp = pygam.LinearGAM().fit(data_train[selected],
                                         data_train[response])
    if display:
        print(res_temp.summary())
    return res_temp
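# --- Usage sketch (not from the original source): a minimal, hedged example
# of calling forward_selection on synthetic data. The column names and the
# data-generating process below are purely illustrative.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=['x1', 'x2', 'x3'])
df['y'] = 2.0 * df['x1'] - 0.5 * df['x3'] + rng.normal(scale=0.1, size=200)

# Greedy selection by AIC; only candidates whose term is significant at the
# 5% level can enter the model.
best_model = forward_selection(df, 'y', method='lm', display=False)
print(best_model.params)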
def covariance(self, mean: Optional[T] = None, smooth: Optional[str] = None,
               **kwargs) -> T:
    """Compute an estimate of the covariance.

    Parameters
    ----------
    mean: DenseFunctionalData, default=None
        An estimate of the mean of self. If None, an estimate is computed.
    smooth: str, default=None
        Name of the smoothing method to use ('LocalLinear' or 'GAM').

    Returns
    -------
    obj: DenseFunctionalData object
        An estimate of the covariance as a two-dimensional
        DenseFunctionalData object with same argvals as `self`.

    Keyword Args
    ------------
    kernel_name: str, default='gaussian'
        Name of the kernel used for local polynomial smoothing.
    degree: int, default=1
        Degree used for local polynomial smoothing.
    bandwidth: float, default=1
        Bandwidth used for local polynomial smoothing.
    n_basis: int, default=10
        Number of spline basis functions used for GAM smoothing.

    References
    ----------
    * Yao, Müller and Wang (2005), Functional Data Analysis for Sparse
      Longitudinal Data, Journal of the American Statistical Association,
      Vol. 100, No. 470.
    * Staniswalis, J. G., and Lee, J. J. (1998), Nonparametric Regression
      Analysis of Longitudinal Data, Journal of the American Statistical
      Association, 93, 1403-1418.

    """
    if self.n_dim > 1:
        raise ValueError('Only one dimensional functional data are'
                         ' supported')

    p = self.n_points['input_dim_0']
    argvals = self.argvals['input_dim_0']
    if mean is None:
        mean = self.mean(smooth)
    data = self.values - mean.values
    cov = np.dot(data.T, data) / (self.n_obs - 1)
    cov_diag = np.copy(np.diag(cov))

    if smooth is not None:
        # Remove the covariance diagonal because of measurement errors.
        np.fill_diagonal(cov, np.nan)
        cov = cov[~np.isnan(cov)]

        # Define train vector
        train_ = np.vstack((np.repeat(argvals, repeats=len(argvals)),
                            np.tile(argvals, reps=len(argvals))))
        train = train_[:, train_[0, :] != train_[1, :]]

        if smooth == 'LocalLinear':
            points = kwargs.get('points', 0.5)
            neigh = kwargs.get('neighborhood',
                               int(p * np.exp(-(np.log(np.log(p)))**2)))
            data_smooth = self.smooth(points=points, neighborhood=neigh)
            data = data_smooth.values - mean.values
            cov = np.dot(data.T, data) / (self.n_obs - 1)
        elif smooth == 'GAM':
            n_basis = kwargs.get('n_basis', 10)
            cov = pygam.LinearGAM(pygam.te(0, 1, n_splines=n_basis)).\
                fit(np.transpose(train), cov).\
                predict(np.transpose(train_)).\
                reshape((len(argvals), len(argvals)))
        else:
            raise NotImplementedError('Smoothing method not implemented.')

    # Ensure the covariance is symmetric.
    cov = (cov + cov.T) / 2

    # Smooth the diagonal of the covariance (Yao, Müller and Wang, 2005).
    lp = LocalPolynomial(kernel_name=kwargs.get('kernel_name', 'gaussian'),
                         bandwidth=kwargs.get('bandwidth', 1),
                         degree=kwargs.get('degree', 1))
    var_hat = lp.fit_predict(argvals, cov_diag, argvals)

    # Estimate the noise variance (Staniswalis and Lee, 1998): integrate
    # (var_hat - cov_diag) over the middle half of the domain and divide
    # by the length of that interval.
    ll = argvals[-1] - argvals[0]
    lower = np.sum(~(argvals >= (argvals[0] + 0.25 * ll)))
    upper = np.sum(argvals <= (argvals[-1] - 0.25 * ll))
    weights = integration_weights_(argvals[lower:upper], method='trapz')
    nume = np.dot(weights, (var_hat - cov_diag)[lower:upper])
    self.var_noise = np.maximum(nume / (argvals[upper] - argvals[lower]), 0)

    new_argvals = {'input_dim_0': argvals, 'input_dim_1': argvals}
    return DenseFunctionalData(new_argvals, cov[np.newaxis])
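# --- Standalone sketch (not part of the class above): the raw covariance
# surface and the Staniswalis-Lee noise-variance estimate on plain numpy
# arrays, with a crude moving average standing in for the local-polynomial
# smoother. Data and shapes are illustrative.
import numpy as np

rng = np.random.default_rng(1)
argvals = np.linspace(0.0, 1.0, 50)
values = np.sin(2 * np.pi * argvals) + rng.normal(scale=0.1, size=(20, 50))

data = values - values.mean(axis=0)          # center the curves
cov = data.T @ data / (values.shape[0] - 1)  # empirical covariance surface
cov_diag = np.diag(cov).copy()
var_hat = np.convolve(cov_diag, np.ones(5) / 5, mode='same')  # crude smooth

# Integrate (var_hat - cov_diag) over the middle half of the domain and
# divide by its length (Staniswalis and Lee, 1998).
ll = argvals[-1] - argvals[0]
lower = np.sum(argvals < argvals[0] + 0.25 * ll)
upper = np.sum(argvals <= argvals[-1] - 0.25 * ll)
noise = np.trapz((var_hat - cov_diag)[lower:upper], argvals[lower:upper])
var_noise = max(noise / (argvals[upper] - argvals[lower]), 0.0)
print(var_noise)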
import numpy as np
import pygam as pg
from matplotlib import pyplot as plt
from sklearn import linear_model, model_selection, preprocessing

# Standardised firing rates...
X = preprocessing.StandardScaler().\
    fit_transform(spike_rates_0p25[speed_corr_neurons_index].transpose())
# ...or not: use the raw firing rates instead (this overrides the scaled X).
Y = np.array(speeds_0p25[:-1])
X = spike_rates_0p25[speed_corr_neurons_index].transpose()

# Set up the regressors
model_linear = linear_model.LinearRegression(fit_intercept=True)
model_lassoCV = linear_model.LassoCV(cv=5, fit_intercept=True)
model_lasso = linear_model.Lasso(alpha=0.02, fit_intercept=True,
                                 max_iter=10000)
model_gam = pg.LinearGAM()

# Squash the speeds into (0, 1) with a logistic transform and switch to a
# Gamma GAM, which requires a strictly positive response.
Y = np.exp(Y) / (np.exp(Y) + 1)
model_gam = pg.GammaGAM()

# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0)

# Fit (gridsearch tunes the smoothing penalty and refits the model)
model = model_gam
#model.fit(X_train, Y_train)
model.gridsearch(X_train, Y_train)

# Show results
plt.figure(1)
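# --- Hedged evaluation sketch (the original plotting code is not shown
# here): compare predicted and actual squashed speeds on the held-out test
# set; r2_score comes from sklearn.metrics.
from sklearn.metrics import r2_score

Y_pred = model.predict(X_test)
print('held-out R^2:', r2_score(Y_test, Y_pred))
plt.plot(Y_test, label='actual')
plt.plot(Y_pred, label='predicted')
plt.legend()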
def gam_decomposition_old_old_old(Xd, Enat, Sigma=None, time_center=None,
                                  gam_dof=7, verbose=False):  ##{{{
    """
    NSSEA.gam_decomposition
    =======================
    Perform the decomposition anthropic/natural forcing with GAM

    arguments
    ---------
    Xd         : pandas DataFrame of the covariate, one column per model
    Enat       : pandas DataFrame of the natural forcing, best estimate in
                 column 0 followed by one column per resample
    Sigma      : covariance matrix used to draw the noise of the linear and
                 spline terms
    time_center: time at which the decomposition is centered (optional)
    gam_dof    : degrees of freedom of the GAM
    verbose    : print a progress bar if True
    """
    models = Xd.columns.to_list()
    n_models = Xd.shape[1]
    n_sample = Enat.shape[1] - 1
    time = Xd.index.values
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Eant = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):
        ## Best estimate: unpenalised spline in time plus linear term in Enat
        gam_model = pg.LinearGAM(
            pg.s(0, n_splines=gam_dof - 2, penalties=None) +
            pg.l(1, penalties=None))
        gam_model.fit(np.stack((time, Enat.values[:, 0]), -1),
                      Xd.values[:, i])

        X.values[:, 0, 0, i] = gam_model.predict(
            np.stack((time, Enat.values[:, 0]), -1))
        X.values[:, 0, 1, i] = gam_model.predict(
            np.stack((time_l, Enat.values[:, 0]), -1))
        X.values[:, 0, 2, i] = gam_model.predict(np.stack((time, Eant), -1))

        for j in range(n_sample):
            if verbose:
                pb.print()

            Xl = Enat.values[:, j + 1]
            mVt = np.stack((time, Xl), -1)
            mVl = np.stack((time_l, Xl), -1)

            ## GAM decomposition
            gam_model = pg.LinearGAM(
                pg.s(0, n_splines=gam_dof - 2, penalties=None) +
                pg.l(1, penalties=None))
            gam_model.fit(mVt, Xd.values[:, i])

            ## Coefficients of decomposition
            int_coef = gam_model.coef_[-1]
            lin_coef = gam_model.coef_[-2]
            spl_coef = gam_model.coef_[:-2]
            spl_mat = gam_model._modelmat(mVt).todense()[:, :-2]
            proj_mat = spl_mat @ np.linalg.inv(spl_mat.T @ spl_mat) @ spl_mat.T

            ## Noise of linear term
            sigma_lin = np.sqrt(
                (Xl.transpose() @ Sigma @ Xl) / (Xl.transpose() @ Xl)**2)
            noise_lin = np.random.normal(loc=0, scale=sigma_lin)

            ## Noise of spline term
            std_spl = matrix_squareroot(
                matrix_positive_part(proj_mat.transpose() @ Sigma @ proj_mat))
            noise_spl = np.ravel(std_spl @ np.random.normal(
                loc=0, scale=1, size=time.size).reshape((n_time, 1)))
            noise_spl = noise_spl - noise_spl[0]

            ## Final decomposition
            gam_model.coef_[-2] += noise_lin
            X.values[:, j + 1, 0, i] = gam_model.predict(mVt) + noise_spl
            X.values[:, j + 1, 1, i] = gam_model.predict(mVl)
            X.values[:, j + 1, 2, i] = gam_model.predict(
                np.stack((time, Eant), -1)) + noise_spl

    X_event, X_center = None, None
    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose:
        pb.end()

    return XSplitted(X, X_event, X_center)
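# --- Hedged sketch of the two matrix helpers used above; the actual NSSEA
# implementations may differ. The positive part clips negative eigenvalues,
# and the square root is taken in the same (symmetrised) eigenbasis.
import numpy as np

def matrix_positive_part(M):
    # Symmetrise, then zero out negative eigenvalues.
    vals, vecs = np.linalg.eigh((M + M.T) / 2)
    return vecs @ np.diag(np.maximum(vals, 0.0)) @ vecs.T

def matrix_squareroot(M):
    # Symmetric square root via eigendecomposition (M assumed PSD).
    vals, vecs = np.linalg.eigh((M + M.T) / 2)
    return vecs @ np.diag(np.sqrt(np.maximum(vals, 0.0))) @ vecs.T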
def gam_decomposition_classic(lX, Enat, Sigma=None, time_center=None,
                              n_splines=None, gam_lam=None,
                              verbose=False):  ##{{{
    """
    NSSEA.gam_decomposition
    =======================
    Perform the decomposition anthropic/natural forcing with GAM

    arguments
    ---------
    lX         : list of one-column pandas DataFrames, one per model
    Enat       : pandas DataFrame of the natural forcing, best estimate in
                 column 0 followed by one column per resample
    Sigma      : unused, kept for interface compatibility
    time_center: time at which the decomposition is centered (optional)
    n_splines  : number of splines of the GAM (default 8)
    gam_lam    : penalty of the spline term (default 0.6)
    verbose    : print a progress bar if True
    """
    models = [lx.columns[0] for lx in lX]
    n_models = len(models)
    n_sample = Enat.shape[1] - 1
    time = np.unique(lX[0].index)
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Xa = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    spl_pen = "auto"
    lin_pen = None
    if n_splines is None:
        n_splines = 8
    if gam_lam is None:
        gam_lam = 0.6

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):
        Xl = Enat.values[:, 0]
        x_all = np.stack((time, Xl), -1)
        x_nat = np.stack((time_l, Xl), -1)

        ## GAM decomposition: penalised spline in time plus linear term
        gam_model = pg.LinearGAM(
            pg.s(0, n_splines=n_splines, penalties=spl_pen, lam=gam_lam) +
            pg.l(1, penalties=lin_pen))
        # gam_model = pg.LinearGAM( pg.s( 0 , n_splines = gam_dof - 2 , penalties = spl_pen , lam = 0.9 ) + pg.l( 1 , penalties = lin_pen ) )
        # gam_model = pg.LinearGAM( pg.s( 0 , n_splines = gam_dof - 2 , penalties = spl_pen ) + pg.l( 1 , penalties = lin_pen ) )
        gam_model.fit(
            np.stack((lX[i].index, Enat.loc[lX[i].index, 0].values), -1),
            lX[i].values)

        X.values[:, 0, 0, i] = gam_model.predict(x_all)
        X.values[:, 0, 1, i] = gam_model.predict(x_nat)

        mean_coef = gam_model.coef_
        cov_coef = gam_model.statistics_["cov"]

        for j in range(n_sample):
            if verbose:
                pb.print()

            Xl = Enat.values[:, j + 1]
            x_all = np.stack((time, Xl), -1)
            x_nat = np.stack((time_l, Xl), -1)

            ## Perturbation: draw the coefficients from their estimated
            ## sampling distribution
            gam_model.coef_ = np.random.multivariate_normal(mean=mean_coef,
                                                            cov=cov_coef,
                                                            size=1).ravel()

            ## Final decomposition
            X.values[:, j + 1, 0, i] = gam_model.predict(x_all)
            X.values[:, j + 1, 1, i] = gam_model.predict(x_nat)

    ## Anthropic forcing is the residual: all minus natural
    X.loc[:, :, "ant", :] = X.loc[:, :, "all", :] - X.loc[:, :, "nat", :]

    X_event, X_center = None, None
    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose:
        pb.end()

    return XSplitted(X, X_event, X_center)
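# --- Minimal sketch of the coefficient-resampling idea used above: fit a
# pygam model once, then draw coefficient vectors from their estimated
# sampling distribution (pygam stores its covariance in statistics_['cov'])
# and predict with each draw. The toy data below are illustrative.
import numpy as np
import pygam as pg

rng = np.random.default_rng(2)
t = np.linspace(1950.0, 2020.0, 71)
y = 0.02 * (t - 1950.0) + np.sin(t / 5.0) + rng.normal(scale=0.1,
                                                       size=t.size)

gam = pg.LinearGAM(pg.s(0, n_splines=8)).fit(t, y)
draws = np.random.multivariate_normal(gam.coef_,
                                      gam.statistics_['cov'], size=100)

# One prediction trajectory per coefficient draw.
preds = np.empty((draws.shape[0], t.size))
for k, c in enumerate(draws):
    gam.coef_ = c            # overwrite the coefficients with one draw
    preds[k] = gam.predict(t)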
# (tail of a truncated apply/lambda expression from the original file)
                slope*irdf.wav_stats[x] + const if (irdf.wav_stats[x] > 0) & \
                (irdf.wav_stats[x] < 8) & (irdf.number_reads[x] > 100) & \
                (irdf.smoothednumberofpeaks[x] == 1) & \
                (irdf.fraction_canonical[x] > 0.3) else np.nan)

#########

#irdfold=pd.read_pickle(martin + 'combined_analysis/dataframes/irdf_corrected_fromunbiasedmappingwithoutumis_July2018.pkl')

#### calculate noise residuals using a generalized additive model

meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1) &
                     (irdf.number_reads > 100) &
                     (irdf.fraction_canonical > 0.3)][[
                         'wav_stats', 'rnaperdna', 'noisestrengthlogwstd'
                     ]].dropna()

# Grid search over the smoothing penalty for a spline in the splicing value
# plus a linear term in RNA/DNA.
randomgaml = pygam.LinearGAM(pygam.s(0) + pygam.l(1)).gridsearch(
    meanplusnoise[['wav_stats', 'rnaperdna']].values,
    meanplusnoise.noisestrengthlogwstd.values,
    lam=[0.01, 0.1, 1, 5, 10])

# Fixed-penalty fit used downstream.
gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10) +
                       pygam.l(1, lam=1)).fit(
                           meanplusnoise[['wav_stats', 'rnaperdna']],
                           meanplusnoise.noisestrengthlogwstd)

pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])
meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = meanplusnoise['noisestrengthlogwstd'] - \
    meanplusnoise['noisegampred']
irdf['noiseresgam'] = meanplusnoise['noiseresgam']
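# --- Hedged sanity check (not in the original script): gridsearch returns
# the refitted best model, so its selected penalties and AIC can be compared
# against the fixed-penalty fit used downstream.
print('gridsearch lam:', randomgaml.lam)
print('AIC gridsearch vs fixed:',
      randomgaml.statistics_['AIC'], gaml.statistics_['AIC'])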
plt.xticks([0, 0.5, 1])
plt.xlabel('fraction of random sets with\nvariance<variance(barcode control set)')
f.savefig('./figures/Fig6/Fig6D_five_bccontrols_vs_randomdistribution_overview.png',
          dpi=300, format='png', bbox_inches='tight')

#%%
################
# Overview plots, relationship mean splicing values - noise
################

meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1) &
                     (irdf.number_reads > 100) &
                     (irdf.fraction_canonical > 0.3)][[
                         'wav_stats', 'rnaperdna', 'noisestrengthlogwstd'
                     ]].dropna()

gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10) +
                       pygam.l(1, lam=1)).fit(
                           meanplusnoise[['wav_stats', 'rnaperdna']],
                           meanplusnoise.noisestrengthlogwstd)
pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])
meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = meanplusnoise['noisestrengthlogwstd'] - \
    meanplusnoise['noisegampred']

f = plt.figure(figsize=(4, 4))
plt.scatter(meanplusnoise.wav_stats, meanplusnoise.noisestrengthlogwstd,
            s=10, alpha=0.2, color=sns.xkcd_rgb['medium blue'])
plt.plot(meanplusnoise.wav_stats, meanplusnoise.noisegampred, '.',
         color=sns.xkcd_rgb['light green'], alpha=0.2, markersize=5)
plt.xlabel('splicing value')
plt.ylabel('splicing noise strength [log2]')
plt.ylim(-9, 3)
plt.xlim(0, 7.2)
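# --- Optional diagnostic (not in the original script): plotting the GAM
# residuals against the splicing value should show no remaining trend if
# the mean-noise relationship has been captured.
f2 = plt.figure(figsize=(4, 4))
plt.scatter(meanplusnoise.wav_stats, meanplusnoise.noiseresgam,
            s=10, alpha=0.2, color=sns.xkcd_rgb['medium blue'])
plt.axhline(0, color='grey', linewidth=1)
plt.xlabel('splicing value')
plt.ylabel('noise residual [log2]')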