Example #1
import pygam
import statsmodels.api as sm


def forward_selection(data_train, response, method, display=True):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data_train.columns)
    remaining.remove(response)
    selected = []

    # AIC is minimized, so start both scores at a large sentinel value.
    current_score, best_new_score = 1e99, 1e99

    while remaining and current_score == best_new_score:
        scores_with_candidates = []

        for candidate in remaining:
            if method == 'lm':
                res_temp = sm.OLS(
                    data_train[response],
                    sm.add_constant(data_train[selected + [candidate]])).fit()
                score = res_temp.aic
                scores_with_candidates.append(
                    (score, candidate, res_temp.pvalues.iloc[-1]))
            elif method == 'glm':
                res_temp = sm.GLM(data_train[response],
                                  sm.add_constant(data_train[selected +
                                                             [candidate]]),
                                  family=sm.families.Gaussian()).fit()
                score = res_temp.aic
                scores_with_candidates.append(
                    (score, candidate, res_temp.pvalues.iloc[-1]))
            elif method == 'gam':
                res_temp = pygam.LinearGAM().fit(
                    data_train[selected + [candidate]], data_train[response])
                score = res_temp.statistics_['AIC']
                scores_with_candidates.append(
                    (score, candidate, res_temp.statistics_['p_values'][-1]))

        scores_with_candidates.sort()
        # The list is sorted ascending, so the first tuple has the lowest AIC.
        best_new_score, best_candidate, p_value = scores_with_candidates[0]

        # Accept the candidate only if it lowers the AIC and is significant.
        if current_score > best_new_score and p_value <= 0.05:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score

    if method == 'lm':
        res_temp = sm.OLS(data_train[response],
                          sm.add_constant(data_train[selected])).fit()
    elif method == 'glm':
        res_temp = sm.GLM(data_train[response],
                          sm.add_constant(data_train[selected]),
                          family=sm.families.Gaussian()).fit()
    elif method == 'gam':
        res_temp = pygam.LinearGAM().fit(data_train[selected],
                                         data_train[response])

    if display:
        print(res_temp.summary())

    return res_temp
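
# A minimal usage sketch; the toy DataFrame and its column names below are
# illustrative, not part of the original example.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'x1': rng.normal(size=200),
                   'x2': rng.normal(size=200),
                   'noise': rng.normal(size=200)})
df['y'] = 2 * df['x1'] - df['x2'] + rng.normal(scale=0.1, size=200)

# Forward selection should retain x1 and x2 and drop 'noise'.
model = forward_selection(df, 'y', method='lm', display=False)
print(model.params)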
Example #2
    # Method of a DenseFunctionalData object: np, pygam, LocalPolynomial,
    # integration_weights_ and DenseFunctionalData are module-level imports
    # of the surrounding package.
    def covariance(self,
                   mean: Optional[T] = None,
                   smooth: Optional[str] = None,
                   **kwargs) -> T:
        """Compute an estimate of the covariance.

        Parameters
        ----------
        smooth: str, default=None
            Name of the smoothing method to use. Currently, not implemented.
        mean: DenseFunctionalData, default=None
            An estimate of the mean of self. If None, an estimate is computed.

        Returns
        -------
        obj: DenseFunctionalData object
            An estimate of the covariance as a two-dimensional
            DenseFunctionalData object with same argvals as `self`.

        Keyword Args
        ------------
        kernel_name: str, default='epanechnikov'
            Name of the kernel used for local polynomial smoothing.
        degree: int, default=1
            Degree used for local polynomial smoothing.
        bandwidth: float, default=1
            Bandwidth used for local polynomial smoothing.
        n_basis: int, default=10
            Number of splines basis used for GAM smoothing.

        References
        ----------
        * Yao, Müller and Wang (2005), Functional Data Analysis for Sparse
        Longitudinal Data,
        Journal of the American Statistical Association, Vol. 100, No. 470
        * Staniswalis, J. G., and Lee, J. J. (1998), “Nonparametric Regression
        Analysis of Longitudinal Data,” Journal of the American Statistical
        Association, 93, 1403–1418.

        """
        if self.n_dim > 1:
            raise ValueError('Only one-dimensional functional data'
                             ' are supported.')

        p = self.n_points['input_dim_0']
        argvals = self.argvals['input_dim_0']
        if mean is None:
            mean = self.mean(smooth)
        data = self.values - mean.values
        cov = np.dot(data.T, data) / (self.n_obs - 1)
        cov_diag = np.copy(np.diag(cov))

        if smooth is not None:
            # Remove the covariance diagonal: it is inflated by measurement
            # error.
            np.fill_diagonal(cov, np.nan)
            cov = cov[~np.isnan(cov)]

            # Define train vector
            train_ = np.vstack((np.repeat(argvals, repeats=len(argvals)),
                                np.tile(argvals, reps=len(argvals))))

            train = train_[:, train_[0, :] != train_[1, :]]

            if smooth == 'LocalLinear':
                points = kwargs.get('points', 0.5)
                neigh = kwargs.get(
                    'neighborhood',
                    int(p * np.exp(-(np.log(np.log(p)))**2)))
                data_smooth = self.smooth(points=points, neighborhood=neigh)
                data = data_smooth.values - mean.values
                cov = np.dot(data.T, data) / (self.n_obs - 1)
            elif smooth == 'GAM':
                n_basis = kwargs.get('n_basis', 10)

                cov = (pygam.LinearGAM(pygam.te(0, 1, n_splines=n_basis))
                       .fit(np.transpose(train), cov)
                       .predict(np.transpose(train_))
                       .reshape((len(argvals), len(argvals))))
            else:
                raise NotImplementedError('Smoothing method not implemented.')

        # Ensure the covariance is symmetric.
        cov = (cov + cov.T) / 2

        # Smoothing the diagonal of the covariance (Yao, Müller and Wang, 2005)
        lp = LocalPolynomial(kernel_name=kwargs.get('kernel_name', 'gaussian'),
                             bandwidth=kwargs.get('bandwidth', 1),
                             degree=kwargs.get('degree', 1))
        var_hat = lp.fit_predict(argvals, cov_diag, argvals)
        # Estimate noise variance (Staniswalis and Lee, 1998)
        ll = argvals[-1] - argvals[0]
        lower = np.sum(argvals < (argvals[0] + 0.25 * ll))
        upper = np.sum(argvals <= (argvals[-1] - 0.25 * ll))
        weights = integration_weights_(argvals[lower:upper], method='trapz')
        nume = np.dot(weights, (var_hat - cov_diag)[lower:upper])
        # Normalize by the length of the integration interval.
        self.var_noise = np.maximum(
            nume / (argvals[upper] - argvals[lower]), 0)

        new_argvals = {'input_dim_0': argvals, 'input_dim_1': argvals}
        return DenseFunctionalData(new_argvals, cov[np.newaxis])
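
# The GAM branch above, shown standalone on a synthetic covariance matrix.
# This is a sketch: the grid and the kernel below are assumptions for
# illustration, not the FDApy API.
import numpy as np
import pygam

argvals = np.linspace(0, 1, 20)
cov = np.exp(-np.abs(argvals[:, None] - argvals[None, :]))
cov[np.diag_indices_from(cov)] += 0.5  # diagonal inflated by noise

# All (s, t) pairs; fit only on off-diagonal entries, predict everywhere.
grid = np.vstack((np.repeat(argvals, len(argvals)),
                  np.tile(argvals, len(argvals))))
off_diag = grid[0] != grid[1]

gam = pygam.LinearGAM(pygam.te(0, 1, n_splines=10))
gam.fit(grid[:, off_diag].T, cov.ravel()[off_diag])
cov_smooth = gam.predict(grid.T).reshape(len(argvals), len(argvals))
cov_smooth = (cov_smooth + cov_smooth.T) / 2  # re-symmetrize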
Example #3
import numpy as np
import matplotlib.pyplot as plt
import pygam as pg
from sklearn import linear_model, model_selection, preprocessing

#   Standardize the spike rates
X = preprocessing.StandardScaler().fit_transform(
    spike_rates_0p25[speed_corr_neurons_index].transpose())

#   Or use the raw (unscaled) spike rates instead
Y = np.array(speeds_0p25[:-1])
X = spike_rates_0p25[speed_corr_neurons_index].transpose()

#   Set up the regressors
model_linear = linear_model.LinearRegression(fit_intercept=True)
model_lassoCV = linear_model.LassoCV(cv=5, fit_intercept=True)
model_lasso = linear_model.Lasso(alpha=0.02,
                                 fit_intercept=True,
                                 max_iter=10000)
model_gam = pg.LinearGAM()

# Squash the speeds through a logistic so the response is strictly positive,
# as required by the Gamma GAM.
Y = np.exp(Y) / (np.exp(Y) + 1)
model_gam = pg.GammaGAM()

#   Split the data to train and test sets
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.2, random_state=0)

# Fit: gridsearch() fits over a grid of smoothing parameters (lam) rather
# than performing a single fit().
model = model_gam
model.gridsearch(X_train, Y_train)

# Show results
plt.figure(1)
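
# A hedged continuation: evaluate the grid-searched GAM on the held-out
# split (the metric chosen here is an assumption, not from the original).
from sklearn.metrics import r2_score

Y_pred = model.predict(X_test)
print('held-out R^2:', r2_score(Y_test, Y_pred))

plt.scatter(Y_test, Y_pred, s=10, alpha=0.5)
plt.xlabel('observed speed')
plt.ylabel('predicted speed')
plt.show()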
Example #4
import numpy as np
import pygam as pg
import xarray as xr

# ProgressBar, matrix_squareroot, matrix_positive_part and XSplitted come
# from the surrounding NSSEA package.


def gam_decomposition_old_old_old(Xd,
                                  Enat,
                                  Sigma=None,
                                  time_center=None,
                                  gam_dof=7,
                                  verbose=False):  ##{{{
    """
	NSSEA.gam_decomposition
	=======================
	Perform the decomposition anthropic/natural forcing with GAM
	
	arguments
	---------
	"""
    models = Xd.columns.to_list()
    n_models = Xd.shape[1]
    n_sample = Enat.shape[1] - 1
    time = Xd.index.values
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Eant = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):
        gam_model = pg.LinearGAM(
            pg.s(0, n_splines=gam_dof - 2, penalties=None) +
            pg.l(1, penalties=None))
        gam_model.fit(np.stack((time, Enat.values[:, 0]), -1), Xd.values[:, i])

        X.values[:, 0, 0, i] = gam_model.predict(
            np.stack((time, Enat.values[:, 0]), -1))
        X.values[:, 0, 1, i] = gam_model.predict(
            np.stack((time_l, Enat.values[:, 0]), -1))
        X.values[:, 0, 2, i] = gam_model.predict(np.stack((time, Eant), -1))

        for j in range(n_sample):
            if verbose: pb.print()

            Xl = Enat.values[:, j + 1]
            mVt = np.stack((time, Xl), -1)
            mVl = np.stack((time_l, Xl), -1)

            ## GAM decomposition
            gam_model = pg.LinearGAM(
                pg.s(0, n_splines=gam_dof - 2, penalties=None) +
                pg.l(1, penalties=None))
            gam_model.fit(mVt, Xd.values[:, i])

            ## Coefficients of decomposition
            int_coef = gam_model.coef_[-1]
            lin_coef = gam_model.coef_[-2]
            spl_coef = gam_model.coef_[:-2]

            spl_mat = gam_model._modelmat(mVt).todense()[:, :-2]
            proj_mat = spl_mat @ np.linalg.inv(spl_mat.T @ spl_mat) @ spl_mat.T

            ## Noise of linear term
            sigma_lin = np.sqrt(
                (Xl.transpose() @ Sigma @ Xl) / (Xl.transpose() @ Xl)**2)
            noise_lin = np.random.normal(loc=0, scale=sigma_lin)

            ## Noise of spline term
            std_spl = matrix_squareroot(
                matrix_positive_part(proj_mat.transpose() @ Sigma @ proj_mat))
            noise_spl = np.ravel(std_spl @ np.random.normal(
                loc=0, scale=1, size=time.size).reshape((n_time, 1)))
            noise_spl = noise_spl - noise_spl[0]

            ## Final decomposition
            gam_model.coef_[-2] += noise_lin
            X.values[:, j + 1, 0, i] = gam_model.predict(mVt) + noise_spl
            X.values[:, j + 1, 1, i] = gam_model.predict(mVl)
            X.values[:, j + 1, 2, i] = gam_model.predict(
                np.stack((time, Eant), -1)) + noise_spl

    # Avoid a NameError in the return below when time_center is None.
    X_event = X_center = None
    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose: pb.end()

    return XSplitted(X, X_event, X_center)
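
# The spline-noise step above draws a Gaussian vector whose covariance is
# proj_mat.T @ Sigma @ proj_mat. A standalone sketch of that idea, with
# illustrative names (matrix_squareroot/matrix_positive_part amount to an
# eigenvalue clip followed by a symmetric square root):
import numpy as np

def sample_correlated_noise(proj, sigma, rng=None):
    """Draw one noise path with covariance proj.T @ sigma @ proj."""
    if rng is None:
        rng = np.random.default_rng()
    cov = proj.T @ sigma @ proj
    cov = (cov + cov.T) / 2  # enforce symmetry
    w, v = np.linalg.eigh(cov)
    root = v @ np.diag(np.sqrt(np.clip(w, 0.0, None))) @ v.T
    return root @ rng.standard_normal(cov.shape[0])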
Example #5
import numpy as np
import pygam as pg
import xarray as xr

# ProgressBar and XSplitted come from the surrounding NSSEA package.


def gam_decomposition_classic(lX,
                              Enat,
                              Sigma=None,
                              time_center=None,
                              n_splines=None,
                              gam_lam=None,
                              verbose=False):  ##{{{
    """
	NSSEA.gam_decomposition
	=======================
	Perform the decomposition anthropic/natural forcing with GAM
	
	arguments
	---------
	"""
    models = [lx.columns[0] for lx in lX]
    n_models = len(models)
    n_sample = Enat.shape[1] - 1
    time = np.unique(lX[0].index)
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Xa = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    spl_pen = "auto"
    lin_pen = None

    if n_splines is None:
        n_splines = 8
    if gam_lam is None:
        gam_lam = 0.6

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):

        Xl = Enat.values[:, 0]
        x_all = np.stack((time, Xl), -1)
        x_nat = np.stack((time_l, Xl), -1)

        ## GAM decomposition
        gam_model = pg.LinearGAM(
            pg.s(0, n_splines=n_splines, penalties=spl_pen, lam=gam_lam) +
            pg.l(1, penalties=lin_pen))
        gam_model.fit(
            np.stack((lX[i].index, Enat.loc[lX[i].index, 0].values), -1),
            lX[i].values)

        X.values[:, 0, 0, i] = gam_model.predict(x_all)
        X.values[:, 0, 1, i] = gam_model.predict(x_nat)

        mean_coef = gam_model.coef_
        cov_coef = gam_model.statistics_["cov"]

        for j in range(n_sample):
            if verbose: pb.print()

            Xl = Enat.values[:, j + 1]
            x_all = np.stack((time, Xl), -1)
            x_nat = np.stack((time_l, Xl), -1)

            ## Perturbation
            gam_model.coef_ = np.random.multivariate_normal(mean=mean_coef,
                                                            cov=cov_coef,
                                                            size=1).ravel()

            ## Final decomposition
            X.values[:, j + 1, 0, i] = gam_model.predict(x_all)
            X.values[:, j + 1, 1, i] = gam_model.predict(x_nat)

    X.loc[:, :, "ant", :] = X.loc[:, :, "all", :] - X.loc[:, :, "nat", :]

    # Avoid a NameError in the return below when time_center is None.
    X_event = X_center = None
    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose: pb.end()

    return XSplitted(X, X_event, X_center)
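
# The perturbation step relies on pygam storing the coefficient covariance
# in statistics_["cov"]. A standalone sketch on synthetic data (the names
# and data here are illustrative):
import numpy as np
import pygam as pg

rng = np.random.default_rng(0)
t = np.linspace(0, 1, 200)
y = np.sin(2 * np.pi * t) + rng.normal(scale=0.2, size=t.size)

X = t.reshape(-1, 1)
gam = pg.LinearGAM(pg.s(0, n_splines=8)).fit(X, y)
mean_coef = gam.coef_
cov_coef = gam.statistics_['cov']

# Each draw is one plausible curve consistent with coefficient uncertainty.
for _ in range(3):
    gam.coef_ = rng.multivariate_normal(mean_coef, cov_coef)
    sample_curve = gam.predict(X)
gam.coef_ = mean_coef  # restore the point estimate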
Example #6
       slope * irdf.wav_stats[x] + const
       if (irdf.wav_stats[x] > 0) & (irdf.wav_stats[x] < 8)
       & (irdf.number_reads[x] > 100)
       & (irdf.smoothednumberofpeaks[x] == 1)
       & (irdf.fraction_canonical[x] > 0.3)
       else np.nan)


#### Calculate noise residuals using a generalized additive model
import numpy as np
import pandas as pd
import pygam

meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1)
                     & (irdf.number_reads > 100) &
                     (irdf.fraction_canonical > 0.3)][[
                         'wav_stats', 'rnaperdna', 'noisestrengthlogwstd'
                     ]].dropna()

randomgaml = pygam.LinearGAM(pygam.s(0) + pygam.l(1)).gridsearch(
    meanplusnoise[['wav_stats', 'rnaperdna']].values,
    meanplusnoise.noisestrengthlogwstd.values,
    lam=[0.01, 0.1, 1, 5, 10])

gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10) +
                       pygam.l(1, lam=1)).fit(
                           meanplusnoise[['wav_stats', 'rnaperdna']],
                           meanplusnoise.noisestrengthlogwstd)

pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])

meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = meanplusnoise[
    'noisestrengthlogwstd'] - meanplusnoise['noisegampred']

irdf['noiseresgam'] = meanplusnoise['noiseresgam']
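
# To compare the grid-searched and the fixed-lam fit, pygam's stored AIC
# can be read back (the same statistic used as statistics_['AIC'] above):
print('gridsearch AIC:', randomgaml.statistics_['AIC'])
print('fixed-lam AIC :', gaml.statistics_['AIC'])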
Example #7
import matplotlib.pyplot as plt
import pandas as pd
import pygam
import seaborn as sns

plt.xticks([0, 0.5, 1])
plt.xlabel('fraction of random sets with\nvariance<variance(barcode control set)')
f.savefig('./figures/Fig6/Fig6D_five_bccontrols_vs_randomdistribution_overview.png',
          dpi=300, format='png', bbox_inches='tight')




#%%

################
# Overview plots, relationship mean splicing values - noise
################
meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1)
                     & (irdf.number_reads > 100)
                     & (irdf.fraction_canonical > 0.3)][[
                         'wav_stats', 'rnaperdna', 'noisestrengthlogwstd'
                     ]].dropna()

gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10)
                       + pygam.l(1, lam=1)).fit(
                           meanplusnoise[['wav_stats', 'rnaperdna']],
                           meanplusnoise.noisestrengthlogwstd)

pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])

meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = (meanplusnoise['noisestrengthlogwstd']
                                - meanplusnoise['noisegampred'])


f = plt.figure(figsize=(4, 4))
plt.scatter(meanplusnoise.wav_stats, meanplusnoise.noisestrengthlogwstd,
            s=10, alpha=0.2, color=sns.xkcd_rgb['medium blue'])
plt.plot(meanplusnoise.wav_stats, meanplusnoise.noisegampred, '.',
         color=sns.xkcd_rgb['light green'], alpha=0.2, markersize=5)
plt.xlabel('splicing value')
plt.ylabel('splicing noise strength [log2]')
plt.ylim(-9, 3)
plt.xlim(0, 7.2)
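
# The snippet ends before the figure is written out; a plausible
# continuation in the script's own style (the output path is hypothetical).
f.savefig('./figures/mean_vs_noise_gam_overview.png',
          dpi=300, format='png', bbox_inches='tight')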