Code Example #1
from statsmodels.api import OLS, add_constant


def create_linear_model(X_train, X_test, Y_train, Y_test):
    ''' TODO...
        - Predict the wine quality using the test set and compare the accuracy
          to the actual quality. Comment.
        - Print the parameter estimates and their 95% confidence intervals in a
          single table. (Suggest using confint() and cbind().)
    '''

    X_train = add_constant(X_train)
    regressionResult = OLS(Y_train, X_train).fit()
    print(regressionResult.summary())

    # Print various attributes of the OLS fitted model
    # print("R Squared: {}".format(regressionResult.rsquared))
    # print("SSE: {}".format(regressionResult.ess))
    # print("SSR: {}".format(regressionResult.ssr))
    # print("Residual MSE: {}".format(regressionResult.mse_resid))
    # print("Total MSE: {}".format(regressionResult.mse_total))
    # print("Model MSE: {}".format(regressionResult.mse_model))
    # print("F-Value: {}".format(regressionResult.mse_model/regressionResult.mse_resid))
    # print("NOBS: {}".format(regressionResult.nobs))
    # print("Centered TSS: {}".format(regressionResult.centered_tss))
    # print("Uncentered TSS: {}".format(regressionResult.uncentered_tss))
    # print("DF Model: {}".format(regressionResult.df_model))
    # print("DF Resid: {}".format(regressionResult.df_resid))
    # print("Standard Errors: {}".format(regressionResult.bse))
    print("Confidence: {}".format(regressionResult.conf_int()))

    predictions = regressionResult.predict(X_train)

    nobs, p = X_train.shape
    # extractAIC is a user-defined helper (not shown in this snippet)
    eaic = extractAIC(nobs, p, Y_train, predictions)
    print("Extract AIC: {}".format(eaic))

    params = regressionResult.params

    # n, p = X_test.shape
    # X_test = add_constant(X_test)
    # predictions = X_test.dot(params).reshape(n,1)

    # num_matches = 0
    # for i in range(len(Y_test)):
    #     p = int(round(predictions[i][0], 0))
    #     is_match = (Y_test[i] == p)

    #     if is_match:
    #         num_matches += 1

    #     print("Actual: {}, Predictions: {}... Match: {}".format(Y_test[i], p, is_match))

    # print("Number of matches: {}, Total number of Instances: {}".format(num_matches, n))
    # print("Percent correct guesses: {}%".format(round((num_matches/n)*100, 3)))

    return params
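For reference, here is a minimal sketch of the confint()/cbind() TODO in statsmodels terms, on synthetic data (all names and values below are illustrative, not from the original assignment):

import numpy as np
import pandas as pd
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(0)
X = add_constant(pd.DataFrame(rng.normal(size=(100, 2)), columns=["x1", "x2"]))
y = X @ np.array([1.0, 2.0, -0.5]) + rng.normal(size=100)
res = OLS(y, X).fit()
# cbind()-style table: estimates next to their 95% confidence bounds
table = pd.concat([res.params.rename("estimate"), res.conf_int()], axis=1)
table.columns = ["estimate", "ci_low", "ci_high"]
print(table)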
Code Example #2
def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7, nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array-like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : integer
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : integer
        The number of bins for estimating the marginal density
        of Z-scores.

    Returns
    -------
    fdr : array-like
        A vector of FDR values

    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> from statsmodels.stats.multitest import NullDistribution
    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    import numpy as np
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat,
             family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr
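A quick usage sketch on simulated data (a null-heavy mixture; the numbers are illustrative):

import numpy as np
from statsmodels.stats.multitest import local_fdr

rng = np.random.default_rng(0)
zscores = np.concatenate([rng.standard_normal(900),    # true nulls
                          rng.normal(3.0, 1.0, 100)])  # shifted non-nulls
fdr = local_fdr(zscores)
print("share of z-scores with local FDR < 0.2:", (fdr < 0.2).mean())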
Code Example #3
    def fit_ols(self):
        # Column names are Spanish: fecha = date, fallecimientos = deaths
        self.data_lag.loc[self.data_lag.fecha <= "2020-04-04", "days"] = 30
        ts_ols = OLS(
            self.data_lag.iloc[:-1, ].fallecimientos,
            self.data_lag.iloc[:-1, ].drop(["fecha", "fallecimientos"],
                                           axis=1)).fit()
        summary = ts_ols.summary()  # renamed from `sum`, which shadows the builtin
        predictions = pd.DataFrame(
            ts_ols.predict(self.forecast.drop("fecha", axis=1)))

        # "Predicción de hoy" = today's prediction, "Error de hoy" = today's error
        e = pd.DataFrame({
            "Modelo": "OLS",
            "Predicción de hoy": [predictions.iloc[0, 0]],
            "Error de hoy": [
                abs(predictions.iloc[0, 0] -
                    self.dt.loc[len(self.dt) - 1, "fallecimientos"])
            ]
        })

        predictions["fecha"] = self.dt.loc[len(self.dt) - 1, "fecha"]
        predictions.columns = ["fallecimientos", "fecha"]
        predictions.reset_index(drop=True, inplace=True)
        for i in range(len(self.forecast)):
            predictions.loc[i, "fecha"] = predictions.fecha[i] + timedelta(days=i)

        new = pd.concat(
            (self.dt[["fallecimientos", "fecha"]], predictions.iloc[1:, :]),
            axis=0)

        new["Predicciones"] = np.where(
            new.fecha <= self.dt.loc[len(self.dt) - 1, "fecha"], "Real",
            "Pred")

        fig = px.bar(
            new,
            x="fecha",
            y="fallecimientos",
            color="Predicciones",
        )

        # predictions.columns =["Predicciones_Fallecimientos", "fecha"]
        #
        # load = str(self.dt.loc[len(self.dt)-1, "fecha"] - timedelta(days=1))
        # load = load[0:10] + ".pkl"
        #
        # with open(load, "rb") as file:
        #     historic = pickle.load(file)
        # predictions["Error"] = 0
        # p=pd.concat([predictions.reset_index(drop=True), historic], ignore_index=True)
        # p = p.loc[p.fecha <= self.dt.loc[len(self.dt)-1, "fecha"],:]
        # p.reset_index(drop=True, inplace=True)
        # for i in range(0,len(p)):
        #     if self.dt.loc[len(self.dt)-1,"fecha"] == p.loc[i,"fecha"]:
        #         p.loc[i,"Error"] = np.sqrt((self.dt.loc[len(self.dt)-1,"fallecimientos"] - p.loc[i,"Predicciones_Fallecimientos"])**2)
        #
        # save = str(self.dt.loc[len(self.dt)-1, "fecha"])
        # save = save[0:10] + ".pkl"
        #
        # with open(save, "wb") as file:
        #     pickle.dump(p, file)

        return e, fig, summary
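The core pattern here is OLS on lagged features plus a one-step-ahead forecast; a minimal self-contained sketch on a synthetic series (column names hypothetical):

import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS

rng = np.random.default_rng(1)
df = pd.DataFrame({"deaths": np.cumsum(rng.normal(5, 1, 60))})  # synthetic cumulative series
df["lag1"] = df["deaths"].shift(1)
df = df.dropna()
res = OLS(df["deaths"], df[["lag1"]]).fit()
forecast = res.predict(df[["lag1"]].iloc[[-1]])  # one-step-ahead prediction
print(float(np.asarray(forecast)[0]))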
Code Example #4
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS


def ols_sm(X_train, y_train, X_test, y_test):
    X_train = sm.add_constant(
        X_train)  # adds col of ones for intercept coefficient in OLS model
    ols = OLS(y_train, X_train).fit()
    # with open('ols_model_summary.csv', 'w') as f:
    #     f.write(ols.summary().as_csv())
    with open('ols_model_summary.txt', 'w') as f:
        f.write(ols.summary().as_text())

    # Plot True vs Predicted values to examine if linear model is a good fit
    fig = plt.figure(figsize=(12, 8))
    X_test = sm.add_constant(X_test)
    plt.scatter(y_test, ols.predict(X_test))
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title('True vs Predicted values')
    plt.show()
    plt.close()
    # Add quadratic term to X or take log of y to improve

    # Discern if a linear relationship exists with partial regression plots
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_partregress_grid(ols, fig=fig)
    plt.title('Partial Regression Plots')
    plt.show()
    plt.close()

    # Identify outliers and high leverage points
    # a. Identify outliers (typically, those data points with studentized residuals outside of +/- 3 stdev).
    # Temporarily remove these from your data set and re-run your model.
    # Do your model metrics improve considerably? Does this give you cause for more confidence in your model?
    # b. Identify those outliers that are also high-leverage points (high residual and high leverage --> high influence).
    fig, ax = plt.subplots(figsize=(12, 8))
    fig = sm.graphics.influence_plot(ols, ax=ax, criterion="cooks")
    plt.show()
    fig, ax = plt.subplots(figsize=(8, 6))
    fig = sm.graphics.plot_leverage_resid2(ols, ax=ax)
    plt.show()
    plt.close()

    # Confirm homoscedasticity (i.e., constant variance of residual terms)
    # If residuals exhibit a “funnel shaped” effect, consider transforming your data into logarithmic space.
    studentized_residuals = ols.outlier_test().iloc[:, 0]  # outlier_test() returns a DataFrame
    y_pred = ols.fittedvalues
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_pred, studentized_residuals)
    ax.axhline(y=0.0, color='k', ls='--')
    ax.set_xlabel('Predicted y')
    ax.set_ylabel('Studentized Residuals')
    plt.show()
    plt.close()

    # Test if residuals are normally distributed in QQ plot
    # plots quantile of the normal distribution against studentized residuals
    # if sample quantiles are normally distributed, the dots will align with 45 deg line
    fig, ax = plt.subplots()
    sm.graphics.qqplot(studentized_residuals, fit=True, line='45', ax=ax)
    plt.show()
    plt.close()

    # Find influential points in data
    # DFBETAS - standardized measure of how much each coefficient changes when that observation is left out
    threshold = 2. / len(X_train)**.5
    infl = ols.get_influence()
    df = pd.DataFrame(infl.summary_frame().filter(regex="dfb"))
    inf = df[df > threshold].dropna(axis=0, how='all')
    print('Influential points:\n', inf)
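A hypothetical invocation of ols_sm on synthetic data (80/20 split; every name here is illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 2)), columns=["x1", "x2"])
y = 1.0 + 2.0 * X["x1"] - 0.5 * X["x2"] + rng.normal(size=100)
ols_sm(X.iloc[:80], y.iloc[:80], X.iloc[80:], y.iloc[80:])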
Code Example #5
import pandas as pd
from statsmodels.regression.linear_model import OLS
import numpy as np
np.set_printoptions(suppress=True)

data = pd.read_csv('Dataset/dataset.csv')

X = data["Head Size(cm^3)"].values
y = data["Brain Weight(grams)"].values

X = np.array(X, dtype='float64')
y = np.array(y, dtype='float64')
y = np.reshape(y, (len(y), 1))

X = np.column_stack([np.ones(len(X)), X])

# Fit the regression with statsmodels' OLS

res = OLS(y, X).fit()

# Theta values
theta = res.params

print(theta)

# prediction
ols_pred = res.predict()

print(res.summary())
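As a sanity check, the same coefficients follow from the closed-form normal equations, theta = (X'X)^-1 X'y, reusing X and y from above:

theta_ne = np.linalg.solve(X.T @ X, X.T @ y)
print(theta_ne)  # matches res.params up to floating-point error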
Code Example #6
    # Fragment: x, nobs, sig_e and the smke alias are defined earlier in the
    # source script (smke presumably aliases a statsmodels kernel-regression module)
    order = 3
    exog = x**np.arange(order + 1)
    beta = np.array([1, 1, 0.1, 0.0])[:order+1] # 1. / np.arange(1, order + 2)
    y_true = np.dot(exog, beta)
    y = y_true + sig_e * np.random.normal(size=nobs)
    endog = y

    print('DGP')
    print('nobs=%d, beta=%r, sig_e=%3.1f' % (nobs, beta, sig_e))

    mod_ols = OLS(endog, exog[:,:2])
    res_ols = mod_ols.fit()
    #'cv_ls'[1000, 0.5][0.01, 0.45]
    tst = smke.TestFForm(endog, exog[:,:2], bw=[0.01, 0.45], var_type='cc',
                         fform=lambda x,p: mod_ols.predict(p,x),
                         estimator=lambda y,x: OLS(y,x).fit().params,
                         nboot=1000)

    print('bw', tst.bw)
    print('tst.test_stat', tst.test_stat)
    print(tst.sig)
    print('tst.boots_results mean, min, max', (tst.boots_results.mean(),
                                               tst.boots_results.min(),
                                               tst.boots_results.max()))
    print('lower tail bootstrap p-value', (tst.boots_results < tst.test_stat).mean())
    print('upper tail bootstrap p-value', (tst.boots_results >= tst.test_stat).mean())
    from scipy import stats
    print('asymp. normal p-value (2-sided)', stats.norm.sf(np.abs(tst.test_stat))*2)
    print('asymp. normal p-value (upper)', stats.norm.sf(tst.test_stat))
Code Example #7
File: test_ols.py  Project: biocore/gneiss
# Imports (unittest, numpy as np, pandas as pd, pandas testing utilities as pdt,
# os, shutil, skbio's TreeNode, and gneiss's ols/balance_basis/ilr_inv helpers)
# sit at the top of the original test_ols.py and are omitted from this excerpt.
class TestOLS(unittest.TestCase):
    """ Tests OLS regression with refactored matrix multiplication. """
    def setUp(self):
        np.random.seed(0)
        b01, b11, b21 = 1, 2, -3
        b02, b12, b22 = 2, -1, 4
        n = 50
        x1 = np.linspace(0, 10, n)
        x2 = np.linspace(10, 15, n)
        e = np.random.normal(size=n) * 10
        y1 = b01 + b11 * x1 + b21 * x2 + e
        e = np.random.normal(size=n) * 10
        y2 = b02 + b12 * x1 + b22 * x2 + e
        Y = pd.DataFrame(np.vstack((y1, y2)).T, columns=['y1', 'y2'])

        B = pd.DataFrame([[b01, b11, b21], [b02, b12, b22]])

        X = pd.DataFrame(np.vstack((np.ones(n), x1, x2)).T,
                         columns=['Intercept', 'x1', 'x2'])

        self.Y = Y
        self.B = B
        self.X = X
        self.r1_ = OLS(endog=y1, exog=X).fit()
        self.r2_ = OLS(endog=y2, exog=X).fit()
        self.tree = TreeNode.read(['(c, (b,a)y2)y1;'])

        self.results = "results"
        if not os.path.exists(self.results):
            os.mkdir(self.results)

    def tearDown(self):
        shutil.rmtree(self.results)

    def test_ols_immutable(self):
        # Test that values in the table get filtered out
        # and that the original table doesn't change.
        table = self.Y
        x = pd.DataFrame(self.X.values,
                         columns=self.X.columns,
                         index=range(100, 100 + len(self.X.index)))
        metadata = pd.concat((self.X, x))

        exp_metadata = metadata.copy()
        ols('x1 + x2', self.Y, self.X)
        self.assertEqual(str(table), str(self.Y))
        self.assertEqual(str(metadata), str(exp_metadata))

    def test_ols_missing_metadata(self):
        # Test that samples with missing metadata get filtered out
        # and that the original table doesn't change.
        table = self.Y
        y = pd.DataFrame(self.Y.values,
                         columns=self.Y.columns,
                         index=range(100, 100 + len(self.Y.index)))

        table = pd.concat((self.Y, y))
        ids = np.arange(100, 100 + len(self.X.index))
        x = pd.DataFrame([[np.nan] * len(self.X.columns)] * len(ids),
                         columns=self.X.columns,
                         index=ids)

        metadata = pd.concat((self.X, x))
        model = ols('x1 + x2', table, metadata)
        model.fit()

        # test prediction
        exp = pd.DataFrame({
            'y1': self.r1_.predict(),
            'y2': self.r2_.predict()
        },
                           index=self.Y.index)
        res = model.predict()

        pdt.assert_frame_equal(res, exp)

    def test_ols_test(self):

        model = ols('x1 + x2', self.Y, self.X)
        model.fit()

        # test pvalues
        exp = pd.DataFrame({'y1': self.r1_.pvalues, 'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, exp)

        # test coefficients
        exp = pd.DataFrame({'y1': self.r1_.params, 'y2': self.r2_.params})
        res = model.coefficients()
        pdt.assert_frame_equal(res, exp)

        # test residuals
        exp = pd.DataFrame({
            'y1': self.r1_.resid,
            'y2': self.r2_.resid
        },
                           index=self.Y.index)
        res = model.residuals()
        pdt.assert_frame_equal(res, exp)

        # test prediction
        exp = pd.DataFrame({
            'y1': self.r1_.predict(),
            'y2': self.r2_.predict()
        },
                           index=self.Y.index)
        res = model.predict()
        pdt.assert_frame_equal(res, exp)

        # make a small prediction
        fx = pd.DataFrame([[1, 1, 1], [1, 1, 2]],
                          columns=['Intercept', 'x1', 'x2'],
                          index=['f1', 'f2'])

        rp1 = self.r1_.predict([[1, 1, 1], [1, 1, 2]])
        rp2 = self.r2_.predict([[1, 1, 1], [1, 1, 2]])
        exp = pd.DataFrame({'y1': rp1, 'y2': rp2}, index=['f1', 'f2'])

        res = model.predict(X=fx)
        pdt.assert_frame_equal(res, exp)

        # test r2
        self.assertAlmostEqual(model.r2, 0.21981627865598752)

    def test_ols_ilr_inv_test(self):

        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        basis, _ = balance_basis(self.tree)
        # test pvalues
        exp = pd.DataFrame({'y1': self.r1_.pvalues, 'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, exp)

        # test coefficients
        exp = pd.DataFrame({'y1': self.r1_.params, 'y2': self.r2_.params})

        exp = pd.DataFrame(ilr_inv(exp, basis),
                           columns=['c', 'b', 'a'],
                           index=self.X.columns)

        res = model.coefficients(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

        # test residuals
        exp = pd.DataFrame({
            'y1': self.r1_.resid,
            'y2': self.r2_.resid
        },
                           index=self.Y.index)
        exp = pd.DataFrame(ilr_inv(exp, basis),
                           index=self.Y.index,
                           columns=['c', 'b', 'a'])
        res = model.residuals(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

        # test prediction
        exp = pd.DataFrame({
            'y1': self.r1_.predict(),
            'y2': self.r2_.predict()
        },
                           index=self.Y.index)
        exp = pd.DataFrame(ilr_inv(exp, basis),
                           index=self.Y.index,
                           columns=['c', 'b', 'a'])
        res = model.predict(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

    def test_tvalues(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()

        exp = pd.DataFrame({'y1': self.r1_.tvalues, 'y2': self.r2_.tvalues})
        pdt.assert_frame_equal(model.tvalues, exp)

    def test_mse(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()

        exp = pd.Series({'y1': self.r1_.mse_resid, 'y2': self.r2_.mse_resid})
        pdt.assert_series_equal(model.mse, exp)

    def test_ess(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()

        exp = pd.Series({'y1': self.r1_.ess, 'y2': self.r2_.ess})
        pdt.assert_series_equal(model.ess, exp)

    def test_loo(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        res = model.loo()
        exp = pd.read_csv(get_data_path('loo.csv'), index_col=0)
        pdt.assert_frame_equal(res, exp)

    def test_kfold(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        res = model.kfold(9)
        exp = pd.read_csv(get_data_path('kfold.csv'), index_col=0)
        pdt.assert_frame_equal(res, exp)

    def test_lovo(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        res = model.lovo()
        exp = pd.read_csv(get_data_path('lovo.csv'), index_col=0)
        pdt.assert_frame_equal(res, exp)
Code Example #8
# Cleaned-up transcript of an interactive session; dff_e and dff are data
# containers from that session (not shown here).
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.tsa.tsatools import lagmat2ds

x = np.vstack([dff_e[3], dff[43]]).T
dta = lagmat2ds(x, 1, trim='both', dropex=1)
dtaown = add_constant(dta[:, 1:(1 + 1)], prepend=False)
dtajoint = add_constant(dta[:, 1:], prepend=False)
dtaother = add_constant(dta[:, 2], prepend=False)
print(dtaother.shape)

res2down = OLS(dta[:, 0], dtaown).fit()
res2djoint = OLS(dta[:, 0], dtajoint).fit()
res2dother = OLS(dta[:, 0], dtaother).fit()
print(res2dother.ssr, res2dother.params, res2down.ssr)
print(dta[:3])

plt.plot(dta[:, 0]); plt.plot(res2dother.predict(dtaother))

# Exploratory: compare the fitted line against hand-picked alternatives
plt.plot(dta[:, 0]); plt.plot(res2dother.params[0] * dta[:, 2])
plt.plot(dta[:, 0]); plt.plot(5 * dta[:, 2])
print(np.sum(np.square(dta[:, 0] - 5 * dta[:, 2])))
print(np.sum(np.square(dta[:, 0] - 5 * dta[:, 2] - res2dother.params[1])))
print(np.sum(np.square(dta[:, 0] - 1.568331 * dta[:, 2] - res2dother.params[1])))

# Exploratory: quadratic alternatives
plt.plot(dta[:, 0]); plt.plot(dta[:, 2]**2)
plt.plot(dta[:, 0]); plt.plot(dta[:, 2]**2 * 10)
plt.plot(dta[:, 0]); plt.plot(dta[:, 2]**2 * 13)
print(np.sum(np.square(dta[:, 0] - dta[:, 2]**2 * 13 - res2dother.params[1])))
print(np.sum(np.square(dta[:, 0] - dta[:, 2]**2 * 13)))
print(np.sum(np.square(dta[:, 0] - dta[:, 2]**2 * 13 - dta[:, 2])))
print(np.sum(np.square(dta[:, 0] - dta[:, 2]**2 * 13 - 1.56 * dta[:, 2])))
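The res2down / res2djoint naming above mirrors the internals of statsmodels' Granger-causality test; for reference, a minimal sketch of the packaged equivalent on synthetic data (the series are made up):

import numpy as np
from statsmodels.tsa.stattools import grangercausalitytests

rng = np.random.default_rng(0)
x2 = rng.standard_normal(200)
x1 = np.r_[0.0, 0.8 * x2[:-1]] + 0.1 * rng.standard_normal(200)  # x2 leads x1
grangercausalitytests(np.column_stack([x1, x2]), maxlag=1)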
Code Example #9
File: births.py  Project: ljbelenky/time-series
    # Fragment from a model-selection loop in births.py: SS (a scaler class,
    # presumably sklearn's StandardScaler), births, future, columns, i, upper,
    # and the bics/aics lists are defined earlier in the script.
    ss = SS()

    X = ss.fit_transform(births[columns].values.reshape(-1, len(columns)))
    X = add_constant(X)

    future_X = add_constant(ss.transform(future[columns]))

    # X = SS().fit_transform(births[columns].values.reshape(-1, len(columns)))

    # model1 = LR().fit(births[columns].values.reshape(-1,len(columns)), births['num_births'])
    model2 = OLS(births['num_births'], X).fit()

    bics.append(model2.bic)
    aics.append(model2.aic)

    plt.plot(births.index, model2.predict(X), label=f'{i}:{model2.bic:.3f}')

plt.plot(future.index,
         model2.predict(future_X),
         marker='*',
         color='k',
         linestyle='-')

plt.legend()
plt.plot(original['num_births'], marker='*', linestyle='')
plt.show()

plt.plot(range(2, upper), aics)
plt.plot(range(2, upper), bics)
plt.axvline(2 + np.argmin(bics))
plt.axvline(2 + np.argmin(aics))
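A minimal sketch of the same AIC/BIC model-selection idea on synthetic data (assumed setup; the original loop variables are not shown in the fragment):

import numpy as np
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(0)
t = np.linspace(0, 1, 120)
y = 2 + 3 * t - 4 * t**2 + 0.1 * rng.standard_normal(t.size)
bics = []
for order in range(1, 6):
    X = add_constant(np.vander(t, order + 1, increasing=True)[:, 1:])
    bics.append(OLS(y, X).fit().bic)
print("best polynomial order by BIC:", 1 + int(np.argmin(bics)))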
Code Example #10
def ols(dep_vb, indep_vbs):
    model = OLS(dep_vb, indep_vbs).fit()
    prediction = model.predict()
    residuals = dep_vb - prediction
    return model.params, residuals, prediction
Code Example #11
def ols(X, y):
    model = OLS(y, X).fit()
    prediction = model.predict()
    residuals = y - prediction
    return model.params, residuals, prediction
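A hypothetical call of this helper (assuming the OLS import from the earlier examples is in scope), with the constant column added by hand since statsmodels' OLS does not add one automatically:

import numpy as np
from statsmodels.regression.linear_model import OLS

rng = np.random.default_rng(0)
X = np.column_stack([np.ones(50), rng.normal(size=50)])
y = X @ np.array([1.0, 2.0]) + 0.1 * rng.normal(size=50)
params, residuals, prediction = ols(X, y)
print(params)  # approximately [1.0, 2.0]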
Code Example #12
from sklearn.metrics import r2_score
from statsmodels.api import OLS, add_constant


def statsmodel_regression(X_train, X_test, y_train, y_test):
    """Similar to the above, but uses statsmodels."""
    regr = OLS(y_train, add_constant(X_train)).fit()
    # the test design matrix needs the same constant column as the training one
    predictions = regr.predict(add_constant(X_test))
    r2_test = round(r2_score(y_test, predictions), 2)
    return r2_test, regr
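A hypothetical invocation on synthetic data:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=100)
r2_test, regr = statsmodel_regression(X[:80], X[80:], y[:80], y[80:])
print(r2_test)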
Code Example #13
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.regression.linear_model import OLS
# we use the full df for the regression because we want to weight results by the
# existence of different ads in different neighborhoods, not just unique addresses
X = df[["black_proportion","log_income","asian_proportion","latinx_proportion","log_price"]]
y = df.white_proportion
df_tmp = df.copy()
df_tmp[list(range(30))] = df_tmp[list(range(30))].where(df_tmp[list(range(30))] > .1, 0)
# patsy-style formula fragment left over from an earlier experiment:
# topic_0 + topic_7 + topic_8 + topic_9 + topic_12 + topic_14 + topic_16 + topic_17 + topic_20 + topic_23 + topic_24 + topic_25 + topic_28
X = df[[str(x) for x in [0, 7, 8, 9, 12, 14, 16, 17, 20, 23, 24, 25, 28]] + ["black_proportion", "log_income", "log_price", "total_RE"]]
y = np.where(df['white_proportion'] > np.median(df['white_proportion']), 1, 0)
y = df['income']  # overrides the binary target above; the OLS below regresses income
OLR = OLS(y,X).fit()
OLR.summary()
OLR.predict(exog=X)

# df_full_results and df_results are fitted models from earlier cells (not shown)
df_full_results.params.sort_values()
df_results.params.sort_values()
df_results.summary()
EN = ElasticNet(alpha = .02, l1_ratio=.001)
EN.fit(X,y)
EN.score(X,y)
EN.predict(X)
LinR = LinearRegression()
LinR.fit(X,y)
LinR.score(X,y)

RR = Ridge()
RR.fit(X,y).score(X,y)
pd.Series(RR.coef_)