Esempio n. 1
0
def run_gam_effective_r_from_empirical(state_data,
                                       n_splines=25,
                                       algo=GammaGAM,
                                       n_bootstrap=100):
    """Estimate a smoothed effective R series with a bootstrap of GAM fits.

    The raw series is ``confirmed_new`` divided by the previous row's
    ``confirmed_total``, scaled by ``1 / RECOVERY_RATE``. Each bootstrap
    replicate fits ``algo(s(0, n_splines) + l(0))`` on the row position with
    Dirichlet(1, ..., 1) observation weights.

    Args:
        state_data: table with ``confirmed_new`` and ``confirmed_total``
            columns.
        n_splines: number of splines handed to the spline term.
        algo: GAM class to instantiate for every replicate.
        n_bootstrap: number of weighted refits.

    Returns:
        DataFrame indexed like the raw series with columns ``ML`` (mean of
        the replicate predictions), ``Low_90`` and ``High_90`` (5% and 95%
        quantiles); rows containing NaN are dropped.
    """
    # Offset added before fitting and removed after predicting
    # (numerical stability).
    epsilon = 1

    growth = (state_data['confirmed_new'] /
              state_data['confirmed_total'].shift(1)).dropna()
    R_series = growth * 1 / RECOVERY_RATE

    n_obs = R_series.shape[0]
    X = np.arange(n_obs)
    y = R_series.values + epsilon
    alpha = [1] * n_obs

    # One prediction curve per weighted refit.
    curves = []
    for _ in range(n_bootstrap):
        draw = dirichlet(alpha).rvs(1)
        model = algo(s(0, n_splines) + l(0))
        model.fit(X, y, weights=draw[0])
        curves.append(model.predict(X) - epsilon)

    preds = pd.DataFrame(curves).T

    estimate_rt = pd.DataFrame(
        {
            'ML': preds.mean(axis=1).values,
            'Low_90': preds.quantile(0.05, axis=1).values,
            'High_90': preds.quantile(0.95, axis=1).values,
        },
        index=R_series.index)

    return estimate_rt.dropna()
Esempio n. 2
0
def fit_gam_with_fix_dof(X, Y, dof):  ##{{{
    """Fit a LinearGAM whose effective degrees of freedom are close to `dof`.

    The model is a penalised spline on feature 0 plus an unpenalised linear
    term on feature 1. The spline penalty ``lam`` is searched by bisection on
    [1e-2, 1e2] until ``|dof - edof| <= 1e-2``. Every 100 iterations without
    convergence the bracket is reset and one more spline is allowed.

    Returns the last fitted model.
    """
    lam_min, lam_max = 1e-2, 1e2
    tol = 1e-2

    lam_lo, lam_up = lam_min, lam_max
    n_splines = int(dof + 2)
    n_iter = 0
    gap = 1. + tol  # forces at least one fit
    while gap > tol:
        lam = (lam_up + lam_lo) / 2.

        spline_term = pg.s(0, n_splines=n_splines, penalties="auto", lam=lam)
        linear_term = pg.l(1, penalties=None)
        gam_model = pg.LinearGAM(spline_term + linear_term)
        gam_model.fit(X, Y)

        edof = gam_model.statistics_["edof"]
        # Too few effective dof -> lower the upper bound on lam, else raise
        # the lower bound.
        if edof < dof:
            lam_up = lam
        else:
            lam_lo = lam
        gap = np.abs(dof - edof)

        n_iter += 1
        if n_iter % 100 == 0:
            # Restart the bisection with a richer spline basis.
            lam_lo, lam_up = lam_min, lam_max
            n_splines += 1
    return gam_model
Esempio n. 3
0
def AAM():
    """Fit the extended additive model on the globals X1 / y1 and report errors.

    The model is a LinearGAM with an intercept and five terms (feature
    meanings are taken from the original author's inline comments):
    a concave cubic spline on feature 0, linear terms on features 3 and 2,
    and tensor terms on (0, 1) and (2, 0). Smoothing parameters are chosen by
    ``gridsearch``. The model summary and the in-sample RMSE, MAE and MAPE
    are printed; nothing is returned.
    """
    gam = LinearGAM(
        s(0, n_splines=25, spline_order=3, constraints='concave',
          penalties='auto', basis='cp', edge_knots=[147, 147])
        + l(3)  # the last travel time
        + te(0, 1)  # distance and departure_time
        + te(2, 0)  # distance and isWeekend
        + l(2),  # isWeekend
        fit_intercept=True)

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    gam.gridsearch(X1, y1).summary()

    # Predict once and reuse the result for every error metric.
    predictions = np.array(gam.predict(X1))
    observed = np.array(y1)

    rmse_val = rmse(observed, predictions)
    print("RMSE is: "+str(rmse_val))
    mae = mean_absolute_error(y1, predictions)
    print("MAE is: "+str(mae))
    mape = mean_absolute_percentage_error(observed, predictions)
    print("MAPE is: "+ str(mape))
Esempio n. 4
0
def fit_gam(series, n_splines=25, algo=PoissonGAM, n_bootstrap=100):
    """Fit `n_bootstrap` weighted GAMs of `series` against its row position.

    Each replicate draws observation weights from a Dirichlet(1, ..., 1)
    distribution and fits ``algo(s(0, n_splines) + l(0))``.

    Returns:
        list of the fitted models, in the order they were fitted.
    """
    n_obs = series.shape[0]
    X = np.arange(n_obs)
    y = series.values
    alpha = [1] * n_obs

    def _fit_replicate():
        # One weighted refit; weights come from a single Dirichlet draw.
        draw = dirichlet(alpha).rvs(1)
        model = algo(s(0, n_splines) + l(0))
        model.fit(X, y, weights=draw[0])
        return model

    return [_fit_replicate() for _ in range(n_bootstrap)]
Esempio n. 5
0
def estimate_gam(series, n_splines=25, algo=PoissonGAM, n_bootstrap=100):
    """Return bootstrap GAM predictions of `series` against its row position.

    Each replicate draws observation weights from a Dirichlet(1, ..., 1)
    distribution, fits ``algo(s(0, n_splines) + l(0))`` and predicts on the
    training positions.

    Returns:
        DataFrame with one row per observation and one column per replicate.
    """
    n_obs = series.shape[0]
    X = np.arange(n_obs)
    y = series.values
    alpha = [1] * n_obs

    # Fit every replicate first, then predict, as two separate passes.
    fitted = []
    for _ in range(n_bootstrap):
        draw = dirichlet(alpha).rvs(1)
        model = algo(s(0, n_splines) + l(0))
        model.fit(X, y, weights=draw[0])
        fitted.append(model)

    curves = [model.predict(X) for model in fitted]
    return pd.DataFrame(curves).T
Esempio n. 6
0
def lingam(term='spline'):
    """
    Build an unfitted LinearGAM with a single term on feature 0 and display
    its parameters.

    INPUT:
    term: 'linear', 'spline' or 'factor'

    RETURN:
    model

    RAISES:
    ValueError if `term` is not one of the three supported names
    """
    # Compare by value. `is` tests object identity, which only works for
    # strings the interpreter happens to intern, so an equal string built at
    # runtime (read from a file, concatenated, ...) was wrongly rejected.
    if term not in ('linear', 'spline', 'factor'):
        raise ValueError('Given Gam term unknown')

    if term == 'linear':
        regmod = LinearGAM(l(0))
    # GAM with spline term
    elif term == 'spline':
        regmod = LinearGAM(s(0))
    # GAM with factor term
    else:
        regmod = LinearGAM(f(0))
    utils.display_get_params('LinearGAM Model Description',
                             regmod.get_params())
    return regmod
Esempio n. 7
0
def get_GAM_predictions(Xtrain, Ytrain, Xtest):
    """
    Grid-search a Linear GAM on the training data and predict the test set.
    :param Xtrain: X values for training.
    :param Ytrain: Y values for training.
    :param Xtest:  X values for validation.
    :return: Predictions from Linear GAM model for test dataset
    """
    # Candidate lambda values for the grid search
    lam_grid = np.logspace(-3, 20, 35)
    # The GAM search is given float64 numpy arrays
    train_X = np.array(Xtrain, dtype=np.float64)
    train_y = np.array(Ytrain, dtype=np.float64)

    # Feature indices entering the model, in term order.
    linear_features = (3, 6, 8, 11, 7, 9, 12, 10, 14, 29, 15, 71, 17, 21,
                       107, 16, 68, 78, 61, 55, 31, 13, 37, 4, 5, 2)
    tensor_pairs = ((4, 5), (68, 78))

    # Two splines, then the linear terms, then the tensor interactions.
    terms = s(99) + s(100)
    for feature in linear_features:
        terms = terms + l(feature)
    for first, second in tensor_pairs:
        terms = terms + te(first, second)

    model = LinearGAM(terms).gridsearch(train_X, train_y, lam=lam_grid)
    return model.predict(Xtest)
Esempio n. 8
0
    def GAM1(self):
        """Compare a purely linear GAM with pyGAM's automatic term selection.

        Two LinearGAM models without intercept are grid-searched over
        ``lam = np.logspace(-3, 3, 11)`` on the training data: one with a
        linear term on each of the 24 features, one with ``terms='auto'``.
        For each, the summary is printed, the test-set MSE is computed and,
        when ``self.plot`` is set, predictions are plotted against the test
        data. The first element of each MSE is stored on ``self.GAM1E1P5``
        and ``self.GAM2E1P5`` (saved for Exercise 7).

        Returns 1.
        """
        from pygam import LinearGAM, s, l, f

        def _search_score_plot(gam, title):
            # Grid-search `gam`, print its summary, return the test MSE and
            # optionally plot predictions against the test data.
            mod = gam.gridsearch(self.Xtrain.values, self.ytrain.values,
                                 lam=np.logspace(-3, 3, 11))
            mod.summary()  # Pseudo-R2: 0.6449
            predicted = mod.predict(self.Xtest).reshape(-1, 1)
            mse = np.mean((self.ytest - predicted)**2).values

            if self.plot:
                plt.plot(predicted, label='GAM model')
                plt.plot(self.ytest, label='Testing Data')
                plt.legend()
                plt.title(title)
                plt.ylabel("FFVC score")
                plt.xlabel("Sample no.")
                plt.show()
            return mse

        # First study: one linear term per feature (columns 0..23).
        terms = l(0)
        for column in range(1, 24):
            terms = terms + l(column)
        mse_linear = _search_score_plot(
            LinearGAM(terms=terms, fit_intercept=False),
            "GAM model with linear terms")

        # Second study: let pyGAM pick the terms ('auto').
        mse_auto = _search_score_plot(
            LinearGAM(terms='auto', fit_intercept=False),
            "GAM model with spline terms")

        print(f"Linear GAM produced MSE={mse_linear},"
              "\n"
              f"Spline addition produced MSE={mse_auto}")

        # Save these values for Exercise 7.
        self.GAM1E1P5 = mse_linear[0]
        self.GAM2E1P5 = mse_auto[0]

        return 1
Esempio n. 9
0
# Finish the current panel: three x ticks, axis label, then save it as a PNG.
plt.xticks([0,0.5,1])
plt.xlabel('fraction of random sets with\nvariance<variance(barcode control set)')
# NOTE(review): `f` is presumably the figure handle created earlier in the
# script (not visible in this fragment) - confirm.
f.savefig('./figures/Fig6/Fig6D_five_bccontrols_vs_randomdistribution_overview.png', \
      dpi = 300, format='png', bbox_inches='tight', frameon=True)




#%%

################
# Overview plots, relationship mean splicing values - noise
################
# Keep rows with exactly one smoothed peak, more than 100 reads and a
# canonical fraction above 0.3; restrict to the three columns used below and
# drop rows with a missing value in any of them.
meanplusnoise=irdf[(irdf.smoothednumberofpeaks==1)&(irdf.number_reads>100)&(irdf.fraction_canonical>0.3)][['wav_stats','rnaperdna','noisestrengthlogwstd']].dropna()

# Model noise strength as a 10-spline smooth of wav_stats (feature 0) plus a
# linear effect of rnaperdna (feature 1), both with lam fixed to 1.
gaml=pygam.LinearGAM(pygam.s(0,lam=1, n_splines=10)+pygam.l(1,lam=1)).fit(meanplusnoise[['wav_stats','rnaperdna']], meanplusnoise.noisestrengthlogwstd)

# In-sample predictions on the same rows used for fitting.
pred=gaml.predict(meanplusnoise[['wav_stats','rnaperdna']])

# Store the prediction and the residual (observed minus predicted).
meanplusnoise['noisegampred']=pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam']=meanplusnoise['noisestrengthlogwstd']-meanplusnoise['noisegampred']


# Scatter of observed noise strength against wav_stats, with the GAM
# predictions overlaid as light green dots. This rebinds `f` to a new figure.
f=plt.figure(figsize=(4,4))
plt.scatter(meanplusnoise.wav_stats,\
    meanplusnoise.noisestrengthlogwstd, s=10, alpha=0.2, color=sns.xkcd_rgb['medium blue'])
plt.plot(meanplusnoise.wav_stats, meanplusnoise.noisegampred, '.',  color=sns.xkcd_rgb['light green'], alpha=0.2, markersize=5)
plt.xlabel('splicing value')
plt.ylabel('splicing noise strength [log2]')
plt.ylim(-9,3)
plt.xlim(0,7.2)
Esempio n. 10
0
def gam_decomposition_old_old_old(Xd,
                                  Enat,
                                  Sigma=None,
                                  time_center=None,
                                  gam_dof=7,
                                  verbose=False):  ##{{{
    """
    NSSEA.gam_decomposition
    =======================
    Perform the decomposition anthropic/natural forcing with GAM

    For every column of Xd a LinearGAM (unpenalised spline on time with
    gam_dof - 2 splines + unpenalised linear term on the natural covariate)
    is fitted. The "all" forcing is the prediction on (time, covariate), the
    "nat" forcing freezes time at its first value, and the "ant" forcing sets
    the covariate to zero. Sample 0 ("be") uses column 0 of Enat; each further
    sample uses the next column of Enat and adds random noise to the linear
    coefficient and to the spline part.

    arguments
    ---------
    Xd          : table with .columns, .index and .values, one column per
                  model (presumably a pandas.DataFrame indexed by time -
                  confirm against the caller)
    Enat        : table whose .values has one row per time step; column 0 is
                  used for the "be" sample, columns 1.. for the drawn samples
    Sigma       : matrix used in the noise variance of the linear and spline
                  terms (presumably an n_time x n_time covariance - confirm).
                  NOTE(review): the default None is only usable when Enat has
                  a single column, because Sigma is multiplied as soon as one
                  sample is drawn.
    time_center : time label used to build the centred output, or None
    gam_dof     : degrees of freedom; the spline term gets gam_dof - 2 splines
    verbose     : print a progress bar

    return
    ------
    XSplitted( X , X_event , X_center ), where X_event and X_center are None
    when time_center is None.
    """
    models = Xd.columns.to_list()
    n_models = Xd.shape[1]
    n_sample = Enat.shape[1] - 1
    time = Xd.index.values
    n_time = time.size
    time_l = np.repeat(time[0], n_time)
    Eant = np.repeat(0., n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):
        ## Best estimate: fit on the first column of Enat, no noise added
        gam_model = pg.LinearGAM(
            pg.s(0, n_splines=gam_dof - 2, penalties=None) +
            pg.l(1, penalties=None))
        gam_model.fit(np.stack((time, Enat.values[:, 0]), -1), Xd.values[:, i])

        X.values[:, 0, 0,
                 i] = gam_model.predict(np.stack((time, Enat.values[:, 0]),
                                                 -1))
        X.values[:, 0, 1, i] = gam_model.predict(
            np.stack((time_l, Enat.values[:, 0]), -1))
        X.values[:, 0, 2, i] = gam_model.predict(np.stack((time, Eant), -1))

        for j in range(n_sample):
            if verbose: pb.print()

            Xl = Enat.values[:, j + 1]
            mVt = np.stack((time, Xl), -1)
            mVl = np.stack((time_l, Xl), -1)

            ## GAM decomposition
            gam_model = pg.LinearGAM(
                pg.s(0, n_splines=gam_dof - 2, penalties=None) +
                pg.l(1, penalties=None))
            gam_model.fit(mVt, Xd.values[:, i])

            ## Projector on the spline columns of the model matrix; the code
            ## treats the last two columns as linear term and intercept.
            spl_mat = gam_model._modelmat(mVt).todense()[:, :-2]
            proj_mat = spl_mat @ np.linalg.inv(spl_mat.T @ spl_mat) @ spl_mat.T

            ## Noise of linear term
            sigma_lin = np.sqrt(
                (Xl.transpose() @ Sigma @ Xl) / (Xl.transpose() @ Xl)**2)
            noise_lin = np.random.normal(loc=0, scale=sigma_lin)

            ## Noise of spline term, shifted so that it is zero at the first
            ## time step
            std_spl = matrix_squareroot(
                matrix_positive_part(proj_mat.transpose() @ Sigma @ proj_mat))
            noise_spl = np.ravel(std_spl @ np.random.normal(
                loc=0, scale=1, size=time.size).reshape((n_time, 1)))
            noise_spl = noise_spl - noise_spl[0]

            ## Final decomposition: perturb the linear coefficient in place,
            ## then add the spline noise to the forcings that vary in time
            gam_model.coef_[-2] += noise_lin
            X.values[:, j + 1, 0, i] = gam_model.predict(mVt) + noise_spl
            X.values[:, j + 1, 1, i] = gam_model.predict(mVl)
            X.values[:, j + 1, 2,
                     i] = gam_model.predict(np.stack(
                         (time, Eant), -1)) + noise_spl

    ## Both names must exist even without a centring time, otherwise the
    ## return statement fails on unbound locals with the default argument.
    X_event = None
    X_center = None
    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose: pb.end()

    return XSplitted(X, X_event, X_center)
Esempio n. 11
0
def gam_decomposition_classic(lX,
                              Enat,
                              Sigma=None,
                              time_center=None,
                              n_splines=None,
                              gam_lam=None,
                              verbose=False):  ##{{{
    """
    NSSEA.gam_decomposition
    =======================
    Perform the decomposition anthropic/natural forcing with GAM

    One LinearGAM (penalised spline on time + unpenalised linear term on the
    natural covariate) is fitted per model on that model's own time index.
    The "all" forcing is the prediction on (time, covariate), the "nat"
    forcing freezes time at its first value, and "ant" is "all" minus "nat".
    Sample 0 ("be") uses the fitted coefficients; every further sample
    redraws the coefficients from a multivariate normal centred on the fit,
    with the model's coefficient covariance, and uses the next column of Enat.

    arguments
    ---------
    lX          : list of single-column tables, one per model; the column
                  name is the model name and the index holds the time values
    Enat        : table of the natural covariate addressed both through
                  .values (column 0 for "be", 1.. for samples) and through
                  .loc[time, 0]
    Sigma       : not used by this function
    time_center : time label used to build the centred output, or None
    n_splines   : number of splines of the time term, 8 if None
    gam_lam     : penalty of the spline term, 0.6 if None
    verbose     : print a progress bar

    return
    ------
    XSplitted( X , X_event , X_center ), where X_event and X_center are None
    when time_center is None.
    """
    models = [lx.columns[0] for lx in lX]
    n_models = len(models)
    n_sample = Enat.shape[1] - 1
    time = np.unique(lX[0].index)
    n_time = time.size
    time_l = np.repeat(time[0], n_time)

    sample = ["be"] + ["S{}".format(i) for i in range(n_sample)]
    X = xr.DataArray(np.zeros((n_time, n_sample + 1, 3, n_models)),
                     coords=[time, sample, ["all", "nat", "ant"], models],
                     dims=["time", "sample", "forcing", "models"])

    spl_pen = "auto"
    lin_pen = None

    if n_splines is None:
        n_splines = 8
    if gam_lam is None:
        gam_lam = 0.6

    pb = ProgressBar("GAM decomposition", n_models * n_sample)
    for i in range(n_models):

        Xl = Enat.values[:, 0]
        x_all = np.stack((time, Xl), -1)
        x_nat = np.stack((time_l, Xl), -1)

        ## GAM decomposition, fitted on the time steps of this model only
        gam_model = pg.LinearGAM(
            pg.s(0, n_splines=n_splines, penalties=spl_pen, lam=gam_lam) +
            pg.l(1, penalties=lin_pen))
        gam_model.fit(
            np.stack((lX[i].index, Enat.loc[lX[i].index, 0].values), -1),
            lX[i].values)

        X.values[:, 0, 0, i] = gam_model.predict(x_all)
        X.values[:, 0, 1, i] = gam_model.predict(x_nat)

        ## Distribution the perturbed coefficients are drawn from
        mean_coef = gam_model.coef_
        cov_coef = gam_model.statistics_["cov"]

        for j in range(n_sample):
            if verbose: pb.print()

            Xl = Enat.values[:, j + 1]
            x_all = np.stack((time, Xl), -1)
            x_nat = np.stack((time_l, Xl), -1)

            ## Perturbation: the model is not refitted, only its coefficients
            ## are replaced by a random draw
            gam_model.coef_ = np.random.multivariate_normal(mean=mean_coef,
                                                            cov=cov_coef,
                                                            size=1).ravel()

            ## Final decomposition
            X.values[:, j + 1, 0, i] = gam_model.predict(x_all)
            X.values[:, j + 1, 1, i] = gam_model.predict(x_nat)

    X.loc[:, :, "ant", :] = X.loc[:, :, "all", :] - X.loc[:, :, "nat", :]

    ## Both names must exist even without a centring time, otherwise the
    ## return statement fails on unbound locals with the default argument.
    X_event = None
    X_center = None
    if time_center is not None:
        X_event = X.loc[time_center, :, "all", :]
        X_center = X - X_event

    if verbose: pb.end()

    return XSplitted(X, X_event, X_center)
# Per-row residual of noise strength against a linear function of wav_stats,
# computed only for rows with 0 < wav_stats < 8, more than 100 reads, a single
# smoothed peak and a canonical fraction above 0.3; every other row gets NaN.
# NOTE(review): the expression is `noise - slope*wav_stats + const`. If
# `slope`/`const` (defined outside this fragment) come from a fit of the form
# noise = slope*wav_stats + const, the residual would need `- const` - confirm.
irdf['noiseres_linear']=irdf.index.map(lambda x: irdf.noisestrengthlogwstd[x]- \
       slope*irdf.wav_stats[x] + const if (irdf.wav_stats[x]>0)& \
    (irdf.wav_stats[x]<8)&(irdf.number_reads[x]>100)&(irdf.smoothednumberofpeaks[x]==1)&(irdf.fraction_canonical[x]>0.3) else np.nan)

#########
#irdfold=pd.read_pickle(martin + 'combined_analysis/dataframes/irdf_corrected_fromunbiasedmappingwithoutumis_July2018.pkl')

#### calculate noise residuals using a generalized additive model

# Same row filter as above, restricted to the three columns used by the GAM;
# rows with a missing value in any of them are dropped.
meanplusnoise = irdf[(irdf.smoothednumberofpeaks == 1)
                     & (irdf.number_reads > 100) &
                     (irdf.fraction_canonical > 0.3)][[
                         'wav_stats', 'rnaperdna', 'noisestrengthlogwstd'
                     ]].dropna()

# Grid search over five lam values with default term settings. The result is
# not used again within this fragment.
randomgaml = pygam.LinearGAM(pygam.s(0) + pygam.l(1)).gridsearch(
    meanplusnoise[['wav_stats', 'rnaperdna']].values,
    meanplusnoise.noisestrengthlogwstd.values,
    lam=[0.01, 0.1, 1, 5, 10])

# Model actually used below: 10-spline smooth of wav_stats (feature 0) plus a
# linear effect of rnaperdna (feature 1), both with lam fixed to 1.
gaml = pygam.LinearGAM(pygam.s(0, lam=1, n_splines=10) +
                       pygam.l(1, lam=1)).fit(
                           meanplusnoise[['wav_stats', 'rnaperdna']],
                           meanplusnoise.noisestrengthlogwstd)

# In-sample predictions on the same rows used for fitting.
pred = gaml.predict(meanplusnoise[['wav_stats', 'rnaperdna']])

# Store the prediction and the residual (observed minus predicted).
meanplusnoise['noisegampred'] = pd.Series(pred, index=meanplusnoise.index)
meanplusnoise['noiseresgam'] = meanplusnoise[
    'noisestrengthlogwstd'] - meanplusnoise['noisegampred']
Esempio n. 13
0
               return_type='dataframe')
# B-spline basis (df=6, no intercept column) of the test-set 'Enroll' column.
sp8t = dmatrix("bs(xtest['Enroll'], df=6, include_intercept=False)",
               {"xtest['Enroll']": xtest['Enroll']},
               return_type='dataframe')

# Test design matrix: four raw columns followed by the spline bases
# (sp3t..sp7t are built earlier in the script, outside this fragment).
x2test = pd.concat([
    xtest['Private01'], xtest['Room.Board'], xtest['perc.alumni'],
    xtest['Grad.Rate'], sp3t, sp4t, sp6t, sp7t, sp8t
],
                   axis=1)
# Predict with the previously fitted model `fit2` after adding a constant
# column, and report the test mean squared error.
ypred = fit2.predict(sm.add_constant(x2test))
print('GAM MSE: %.2f' % mean_squared_error(ytest, ypred))  #3876188.82

# (d)
# pyGAM model on x1: linear terms on features 0, 1 and 5, a factor term on
# feature 3 and 6-spline smooths on features 2, 4, 6, 7 and 8.
gam = LinearGAM(
    l(0) + l(1) + s(2, n_splines=6) + f(3) + s(4, n_splines=6) + l(5) +
    s(6, n_splines=6) + s(7, n_splines=6) + s(8, n_splines=6)).fit(x1, ytrain)

# One figure per non-intercept term: partial dependence (solid line) with its
# 95% confidence band (red dashed lines).
for i, term in enumerate(gam.terms):
    if term.isintercept:
        continue

    XX = gam.generate_X_grid(term=i)
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=0.95)

    plt.figure()
    plt.plot(XX[:, term.feature], pdep)
    plt.plot(XX[:, term.feature], confi, c='r', ls='--')
    plt.title(repr(term))
    plt.show()
# Non-linear: Expand, Personal, Accept.
Esempio n. 14
0
# Uninitialised (1000, 1) array. It is not filled within this fragment.
all_permut_preds_alphas=np.empty([1000,1])
# feature weights from the real model will be stored in this array
featureWeights_AI=np.empty([100,data_AI.shape[1]-3])
# adding MSE quantification
mse_AI=np.empty([100,1])
# run real predictions 100 times. Allows for each subject to be randomly allocated to the testing third multiple times.
for split in range(0,100):
# for a few different train and test splits
	# Train and test split from data frame
	# (the split index doubles as random_state, so each split is reproducible)
	xtrain_AI,xtest_AI,ytrain_AI,ytest_AI,indices_train_AI,indices_test_AI=train_test_split(Featvecs_AI,varofintAI,indices,test_size=0.33,random_state=(split))
	# make dataframe of non-brain variables to regress covariates from EF in training
	df=np.array([age[indices_train_AI],mot[indices_train_AI],varofintAI[indices_train_AI]])
	# transpose so subjects are rows
	dft=np.transpose(df)
	# regress covariates from EF in training sample (Linear GAM still has spline term)
	# The terms only reference columns 0 and 1; column 2 of dft is the
	# variable of interest and is passed as the response.
	GAMFit=LinearGAM(s(0,n_splines=5) + l(1)).fit(dft,dft[:,2])
	# get residuals
	residsvec=GAMFit.deviance_residuals(dft[:,[0,1,2]],dft[:,2])
	# set ytrain to residuals
	ytrain_AI=residsvec
	# make equivalent dataframe for testing sample, but fit age and motion effects from training model
	df2=np.array([age[indices_test_AI],mot[indices_test_AI],varofintAI[indices_test_AI]])
	df2t=np.transpose(df2)
	# apply model to unseen data to get those residuals for testing set
	testResidsvec=GAMFit.deviance_residuals(df2t[:,[0,1,2]],df2t[:,2])
	# replace y test with age/motion controlled EF
	ytest_AI=testResidsvec
	# fit model with gcv
	lm_AI = sklearn.linear_model.RidgeCV(alphas=alphas, store_cv_values=True).fit(xtrain_AI,ytrain_AI)
	# set prediction alpha to best performing alpha in training set
	alpha_AI=lm_AI.alpha_
    def fit(
        self,
        X: pd.DataFrame,
        Y: pd.DataFrame,
    ):
        """
        Fit, for every province, a Poisson GAM for the cases using active_cases and percentage_susceptible at time t-1, and another
        Poisson GAM for removed using active_cases at time t-1.

        The training frames and fitted models are stored per province in the dictionaries self.X_cases, self.Y_cases,
        self.X_removed, self.Y_removed, self.poisson_gam_cases and self.poisson_gam_removed. Copies of the inputs are kept in
        self.X_original and self.Y_original.

        Args:
            X (pd.DataFrame): Dataframe for given region of predictor variables containing columns date, province, active_cases, percent_susceptible,
                              and all columns for provinces for {province_name}_active_cases_yesterday, {province_name}_percent_susceptible_yesterday,
                              as well as all log features
            Y (pd.DataFrame): Dataframe for given region of response variables containing columns date, province, cases, removed
        """
        self.X_original = X.copy()
        self.Y_original = Y.copy()
        self.provinces = X["province"].unique()

        # Fit model for each province
        self.X_cases = {}
        self.Y_cases = {}
        self.X_removed = {}
        self.Y_removed = {}
        self.poisson_gam_cases = {}
        self.poisson_gam_removed = {}

        n_provinces = len(self.provinces)
        # Cases model uses either splines or linear terms for every feature
        case_term = s if self.use_splines else l

        for province in self.provinces:
            # Remove extra columns for given province in form {province}_column_name
            # NOTE(review): the province name is used as a regex and inside a
            # query string; names with special characters would need escaping.
            cols_drop = X.filter(regex=province, axis=1).columns
            X_province = X.query(f"province == '{province}'").drop(cols_drop,
                                                                   axis=1)
            Y_province = Y.query(f"province == '{province}'")

            # Store case dataframe used to train model for each province
            self.X_cases[province] = X_province.filter(
                regex=
                r"(log_active_cases_yesterday|log_percent_susceptible_yesterday)"
            )
            self.Y_cases[province] = Y_province["cases"]

            # Add terms for each province I_t-1 and Z_t-1. The first pair of
            # features gets lam_main, every further pair lam_other. The linear
            # branch used lam_other for feature 1, unlike the spline branch;
            # both now penalise the first pair identically.
            terms = case_term(0, lam=self.lam_main) + case_term(
                1, lam=self.lam_main)
            for i in range(1, n_provinces):
                terms += case_term(i * 2, lam=self.lam_other) + case_term(
                    i * 2 + 1, lam=self.lam_other)

            # Fit cases model for province
            cases_model = PoissonGAM(terms, verbose=self.verbose)
            cases_model.fit(self.X_cases[province], self.Y_cases[province])
            self.poisson_gam_cases[province] = cases_model

            # Store remove dataframe used to train model for each province
            self.X_removed[province] = X_province.filter(
                regex=r"log_active_cases_yesterday")
            self.Y_removed[province] = Y_province["removed"]

            # Add terms for each province I_t-1
            terms = l(0, lam=self.lam_main)
            for i in range(1, n_provinces):
                terms += l(i, lam=self.lam_other)

            # Fit removed model for each province on the removed counts; it
            # was previously trained on the cases response by mistake.
            removed_model = PoissonGAM(terms, verbose=self.verbose)
            removed_model.fit(self.X_removed[province],
                              self.Y_removed[province])
            self.poisson_gam_removed[province] = removed_model

        return
# Partial residual plot of AdjSalePrice against SqFtTotLiving for the
# previously fitted spline model.
fig, ax = plt.subplots(figsize=(5, 5))
partialResidualPlot(result_spline, house_98105, 'AdjSalePrice',
                    'SqFtTotLiving', ax)

plt.tight_layout()
plt.show()

### Generalized Additive Models

predictors = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
outcome = 'AdjSalePrice'
X = house_98105[predictors].values
y = house_98105[outcome]

## model: 12-spline smooth on the first predictor, linear terms on the rest
gam = LinearGAM(s(0, n_splines=12) + l(1) + l(2) + l(3) + l(4))
gam.gridsearch(X, y)
# summary() prints the table itself and returns None, so wrapping it in
# print() emitted a stray "None" line.
gam.summary()

fig, axes = plt.subplots(figsize=(8, 8), ncols=2, nrows=3)

# One panel per term: partial dependence with its 95% confidence band.
titles = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade']
for i, title in enumerate(titles):
    ax = axes[i // 2, i % 2]
    XX = gam.generate_X_grid(term=i)
    # With `width` set, a single call returns both the curve and the band.
    pdep, confi = gam.partial_dependence(term=i, X=XX, width=.95)
    ax.plot(XX[:, i], pdep)
    ax.plot(XX[:, i], confi, c='r', ls='--')
    ax.set_title(title)
Esempio n. 17
0
#prep X and y

features = ['year', 'age', 'education']

X = df[features].values
# NOTE(review): the return value is discarded, so this helper presumably
# converts column 2 ('education') of X to integer codes in place - confirm.
crude_strCat_to_int(X, 2)
y = df['wage'].values

# test different types of term on categorical feature
# term types: spline (default), linear effect, factor, spline with categorical dtype

from pygam import LinearGAM, s, f, l

# The four models share spline terms on features 0 and 1 and differ only in
# how feature 2 is modelled.
gam1 = LinearGAM(s(0) + s(1) + s(2)).fit(X, y)
gam2 = LinearGAM(s(0) + s(1) + l(2)).fit(X, y)
gam3 = LinearGAM(s(0) + s(1) + f(2)).fit(X, y)
gam4 = LinearGAM(s(0) + s(1) + s(2, dtype='categorical')).fit(X, y)

# Labels are listed in the same order as the models.
gams = [gam1, gam2, gam3, gam4]
terms_names = ['spline', 'linear', 'factor', 'categorical spline']

###########################################
#compare pdp

import matplotlib.pyplot as plt

# Grid of axes: one row per model, one column per feature, with a shared
# y axis within each row.
plt.rcParams['figure.figsize'] = (28, 28)
fig, axs = plt.subplots(4, X.shape[1], sharey='row')

for r, axr in enumerate(axs):