Code Example #1
def test_f_score_with_covars_and_normalized_design_withcovar(random_state=0):
    """

    This test has a statsmodels dependence. There seems to be no simple
    alternative way to perform an F-test on a linear model that includes
    covariates.

    """
    try:
        from statsmodels.regression.linear_model import OLS
    except ImportError:
        warnings.warn("Statsmodels is required to run this test")
        raise nose.SkipTest

    rng = check_random_state(random_state)

    ### Normalized data
    n_samples = 50
    # generate data
    var1 = np.ones((n_samples, 1)) / np.sqrt(n_samples)  # normalized
    var2 = rng.randn(n_samples, 1)
    var2 = var2 / np.sqrt(np.sum(var2 ** 2, 0))  # normalize
    covars = np.eye(n_samples, 3)  # covars is orthogonal
    covars[3] = -1  # covars is orthogonal to var1
    covars = orthonormalize_matrix(covars)
    # own f_score
    f_val_own = _f_score_with_covars_and_normalized_design(var1, var2, covars)[0]
    # statsmodels f_score
    test_matrix = np.array([[1.0, 0.0, 0.0, 0.0]])
    statsmodels_ols = OLS(var2, np.hstack((var1, covars))).fit()
    f_val_statsmodels = statsmodels_ols.f_test(test_matrix).fvalue[0]
    assert_array_almost_equal(f_val_own, f_val_statsmodels)
Code Example #2
    def test_regularized_weights(self):

        np.random.seed(1432)
        exog1 = np.random.normal(size=(100, 3))
        endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
        exog2 = np.random.normal(size=(100, 3))
        endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

        exog_a = np.vstack((exog1, exog1, exog2))
        endog_a = np.concatenate((endog1, endog1, endog2))

        # Should be equivalent to exog_a, endog_a.
        exog_b = np.vstack((exog1, exog2))
        endog_b = np.concatenate((endog1, endog2))
        wgts = np.ones(200)
        wgts[0:100] = 2
        sigma = np.diag(1/wgts)

        for L1_wt in 0, 0.5, 1:
            for alpha in 0, 1:
                mod1 = OLS(endog_a, exog_a)
                rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod2 = WLS(endog_b, exog_b, weights=wgts)
                rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod3 = GLS(endog_b, exog_b, sigma=sigma)
                rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
                assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
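The equivalence asserted above holds for plain fits too: duplicating rows produces the same least-squares objective as doubling their case weights. A minimal sketch of that, with toy data assumed here:

import numpy as np
from numpy.testing import assert_allclose
from statsmodels.regression.linear_model import OLS, WLS

rng = np.random.RandomState(0)
x1, x2 = rng.normal(size=(30, 2)), rng.normal(size=(30, 2))
y1 = x1.sum(1) + rng.normal(size=30)
y2 = x2.sum(1) + rng.normal(size=30)

# Stacking (x1, x1, x2) counts every row of x1 twice...
fit_dup = OLS(np.concatenate((y1, y1, y2)), np.vstack((x1, x1, x2))).fit()
# ...which is the same objective as weighting the x1 rows by 2.
wgts = np.concatenate((2 * np.ones(30), np.ones(30)))
fit_wls = WLS(np.concatenate((y1, y2)), np.vstack((x1, x2)),
              weights=wgts).fit()
assert_allclose(fit_dup.params, fit_wls.params)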
Code Example #3
File: mass_agg.py Project: zpace/stellarmass_pca
def fit_dlogM_mw(tab, sfrsd_tab, mltype='ring', mlb='i'):
    merge_tab = t.join(tab, sfrsd_tab, 'plateifu')
    is_agn = m.mask_from_maskbits(merge_tab['mngtarg3'], [1, 2, 3, 4])

    mlb_ix = totalmass.StellarMass.bands_ixs[mlb]
    absmag_sun_mlb = totalmass.StellarMass.absmag_sun[mlb_ix]

    logmass_in_ifu = merge_tab['mass_in_ifu'].to(u.dex(u.Msun))
    logmass_in_ifu_lw = merge_tab['ml_fluxwt'] + merge_tab['ifu_absmag'][:, mlb_ix].to(
        u.dex(m.bandpass_sol_l_unit), totalmass.bandpass_flux_to_solarunits(absmag_sun_mlb))
    merge_tab['dlogmass_lw'] = logmass_in_ifu - logmass_in_ifu_lw
    ha_corr = np.exp(merge_tab['mean_atten_mwtd'] * (6563 / 5500)**-1.3)
    sfrsd = merge_tab['sigma_sfr'] * ha_corr * u.Msun / u.yr / u.pc**2
    mass_pca = merge_tab['mass_in_ifu'] + merge_tab['outer_mass_{}'.format(mltype)]
    ssfrsd = sfrsd / mass_pca
    merge_tab['log_ssfrsd'] = ssfrsd.to(u.dex(ssfrsd.unit))
    merge_tab['log_ssfrsd'][~np.isfinite(merge_tab['log_ssfrsd'])] = np.nan * merge_tab['log_ssfrsd'].unit

    ols = OLS(
        endog=np.array(merge_tab['dlogmass_lw'][~is_agn]),
        exog=sm_add_constant(
            t.Table(merge_tab['mean_atten_mwtd', 'std_atten_mwtd', 'log_ssfrsd'])[~is_agn].to_pandas(),
            prepend=False),
        hasconst=True, missing='drop')

    olsfit = ols.fit()

    return olsfit
Code Example #4
File: test_table.py Project: statsmodels/statsmodels
    def test_regression_with_tuples(self):
        i = pandas.Series([1, 2, 3, 4] * 10, name="i")
        y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
        x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

        df = pandas.DataFrame(index=i.index)
        df = df.join(i)
        endo = df.join(y)
        exo = df.join(x)
        endo_groups = endo.groupby("i")
        exo_groups = exo.groupby("i")
        exo_df = exo_groups.agg([np.sum, np.max])
        endo_df = endo_groups.agg([np.sum, np.max])
        reg = OLS(exo_df[[("x", "sum")]], endo_df).fit()
        interesting_lines = []
        import warnings
        with warnings.catch_warnings():
            # Catch omni normality-test warning, not interesting here
            warnings.simplefilter("ignore")
            for line in str(reg.summary()).splitlines():
                if "_" in line:
                    interesting_lines.append(line[:38])

        desired = ["Dep. Variable:                  x_sum ",
                   "y_sum          1.4595      0.209      ",
                   "y_amax         0.2432      0.035      "]

        assert_equal(sorted(desired), sorted(interesting_lines))
Code Example #5
def reset_ramsey(res, degree=5):
    '''Ramsey's RESET specification test for linear models

    This is a general specification test, for additional non-linear effects
    in a model.


    Notes
    -----
    The test fits an auxiliary OLS regression where the design matrix, exog,
    is augmented by powers 2 through `degree` of the fitted values. It then
    performs an F-test of whether these additional terms are jointly
    significant.

    If the p-value of the F-test is below a threshold, e.g. 0.1, this
    indicates that there might be additional non-linear effects in the model
    and that the linear model is mis-specified.


    References
    ----------
    http://en.wikipedia.org/wiki/Ramsey_RESET_test

    '''
    order = degree + 1
    k_vars = res.model.exog.shape[1]
    # Vandermonde matrix of the fitted values, without the constant and
    # linear columns:
    y_fitted_vander = np.vander(res.fittedvalues, order)[:, :-2]
    exog = np.column_stack((res.model.exog, y_fitted_vander))
    res_aux = OLS(res.model.endog, exog).fit()
    #r_matrix = np.eye(degree, exog.shape[1], k_vars)
    r_matrix = np.eye(degree-1, exog.shape[1], k_vars)
    #df1 = degree - 1
    #df2 = exog.shape[0] - degree - res.df_model  (without constant)
    return res_aux.f_test(r_matrix) #, r_matrix, res_aux
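A small usage sketch on simulated data (the setup below is hypothetical); statsmodels ships this helper as statsmodels.stats.outliers_influence.reset_ramsey:

import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

rng = np.random.RandomState(0)
x = rng.uniform(0, 2, size=200)
y = x + 0.5 * x**2 + rng.normal(scale=0.1, size=200)  # truth is quadratic

res_lin = OLS(y, add_constant(x)).fit()  # deliberately misspecified linear fit
print(reset_ramsey(res_lin, degree=3))   # tiny p-value flags the missing term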
Code Example #6
    def setupClass(cls):
        R = np.zeros(7)
        R[4:6] = [1, -1]
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        res1 = OLS(data.endog, data.exog).fit()
        cls.Ttest1 = res1.t_test(R)
Code Example #7
    def test_regularized(self):

        import os
        from . import glmnet_r_results

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        data = np.loadtxt(os.path.join(cur_dir, "results", "lasso_data.csv"),
                          delimiter=",")

        tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]

        for test in tests:

            vec = getattr(glmnet_r_results, test)

            n = vec[0]
            p = vec[1]
            L1_wt = float(vec[2])
            lam = float(vec[3])
            params = vec[4:].astype(np.float64)

            endog = data[0:int(n), 0]
            exog = data[0:int(n), 1:(int(p)+1)]

            endog = endog - endog.mean()
            endog /= endog.std(ddof=1)
            exog = exog - exog.mean(0)
            exog /= exog.std(0, ddof=1)

            mod = OLS(endog, exog)
            rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
            assert_almost_equal(rslt.params, params, decimal=3)

            # Smoke test for summary
            smry = rslt.summary()
Code Example #8
def test_permuted_ols_intercept_statsmodels_withcovar(random_state=0):
    """

    This test has a statsmodels dependence. There seems to be no simple
    alternative way to perform an F-test on a linear model that includes
    covariates.

    """
    try:
        from statsmodels.regression.linear_model import OLS
    except ImportError:
        warnings.warn("Statsmodels is required to run this test")
        raise nose.SkipTest

    rng = check_random_state(random_state)
    # design parameters
    n_samples = 50
    # create design
    target_var = rng.randn(n_samples, 1)
    tested_var = np.ones((n_samples, 1))
    confounding_vars = rng.randn(n_samples, 2)
    # statsmodels OLS
    ols = OLS(target_var, np.hstack((tested_var, confounding_vars))).fit()
    fvals = ols.f_test([[1.0, 0.0, 0.0]]).fvalue
    # permuted OLS
    _, orig_scores, _ = permuted_ols(tested_var, target_var, confounding_vars, n_perm=0, random_state=random_state)
    # same thing but with model_intercept=True to check it has no effect
    _, orig_scores_addintercept, _ = permuted_ols(
        tested_var, target_var, confounding_vars, model_intercept=True, n_perm=0, random_state=random_state
    )
    assert_array_almost_equal(fvals, orig_scores, decimal=6)
    assert_array_almost_equal(orig_scores, orig_scores_addintercept, decimal=6)
Code Example #9
def test_repeat_partition():

    # tests that if we use identical partitions the average is the same
    # as the estimate for the full data

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    def _rep_data_gen(endog, exog, partitions):
        """partitions data"""

        n_exog = exog.shape[0]
        n_part = np.ceil(n_exog / partitions)

        ii = 0
        while ii < n_exog:
            yield endog, exog
            ii += int(n_part)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
Code Example #10
    def setup_class(cls):
        cls.cov_type = 'cluster'

        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='cluster', cov_kwds=dict(groups=group))

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='cluster', cov_kwds=dict(groups=group))
Code Example #11
    def setup_class(cls):
        cls.cov_type = 'HC0'

        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HC0')

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='HC0')
Code Example #12
    def setupClass(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        res1 = OLS(data.endog, data.exog).fit()
        R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]]
        cls.Ftest1 = res1.f_test(R2)
        hyp = 'x2 = x3, x5 = x6'
        cls.NewFtest1 = res1.f_test(hyp)
Code Example #13
def test_filter():
    # Basic test for filtering
    mod = RecursiveLS(endog, exog)
    res = mod.filter()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS(endog, exog)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)
Code Example #14
File: gofplots.py Project: SuperXrooT/statsmodels
def qqline(ax, line, x=None, y=None, dist=None, fmt='r-'):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {'45','r','s','q'}
        Options for the reference line to which the data is compared:

        - '45' - 45-degree line
        - 's'  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - 'r'  - A regression line is fit
        - 'q'  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : array
        X data for plot. Not needed if line is '45'.
    y : array
        Y data for plot. Not needed if line is '45'.
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is 'q'.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.
    """
    if line == '45':
        end_pts = list(zip(ax.get_xlim(), ax.get_ylim()))  # list: zip is lazy on Python 3
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, fmt)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return # does this have any side effects?
    if x is None and y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    elif line == 'r':
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are 'clean'
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, fmt)
    elif line == 's':
        m, b = y.std(), y.mean()
        ref_line = x * m + b
        ax.plot(x, ref_line, fmt)
    elif line == 'q':
        _check_for_ppf(dist)
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m*theoretical_quartiles[0]
        ax.plot(x, m*x + b, fmt)
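A minimal usage sketch (toy data assumed): draw a hand-rolled Q-Q scatter and add a standardized reference line with the function above:

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.RandomState(0)
sample = np.sort(rng.normal(loc=1.0, scale=2.0, size=100))
theor = stats.norm.ppf((np.arange(100) + 0.5) / 100)  # theoretical quantiles

fig, ax = plt.subplots()
ax.plot(theor, sample, 'o')
qqline(ax, 's', x=theor, y=sample)  # scale/shift by the sample std and mean
plt.show()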
Code Example #15
def test_conf_int_single_regressor():
    # GH#706 single-regressor model (i.e. no intercept) with 1D exog
    # should get passed to DataFrame for conf_int
    y = pandas.Series(np.random.randn(10))
    x = pandas.Series(np.ones(10))
    res = OLS(y, x).fit()
    conf_int = res.conf_int()
    np.testing.assert_equal(conf_int.shape, (1, 2))
    np.testing.assert_(isinstance(conf_int, pandas.DataFrame))
Code Example #16
def test_706():
    # make sure one regressor pandas Series gets passed to DataFrame
    # for conf_int.
    y = pandas.Series(np.random.randn(10))
    x = pandas.Series(np.ones(10))
    res = OLS(y,x).fit()
    conf_int = res.conf_int()
    np.testing.assert_equal(conf_int.shape, (1, 2))
    np.testing.assert_(isinstance(conf_int, pandas.DataFrame))
Code Example #17
def test_summary_as_latex():
    # GH#734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    res = OLS(y, X).fit()
    with pytest.warns(UserWarning):
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\\{Date:\\}             &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\\{Time:\\}             &).+?&",
                   "     13:46:07     &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &     -177.029    &      207.153     \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &       -0.112    &        0.040     \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &       -3.125    &       -0.915     \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &       -1.518    &       -0.549     \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &       -0.563    &        0.460     \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &      798.788    &     2859.515     \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &     -5.5e+06    &    -1.47e+06     \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}

Warnings: \\newline
 [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
 [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
 strong multicollinearity or other numerical problems."""
    assert_equal(table, expected)
Code Example #18
    def setup_class(cls):

        cls.cov_type = 'HAC'

        kwds = {'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)
Code Example #19
File: test_regression.py Project: 5267/statsmodels
    def test_empty_model(self):

        np.random.seed(742)
        n = 100
        endog = np.random.normal(size=n)
        exog = np.random.normal(size=(n, 3))

        model = OLS(endog, exog)
        result = model.fit_regularized(alpha=1000)

        assert_equal(result.params, 0.)
Code Example #20
File: test_diagnostic.py Project: AnaMP/statsmodels
def test_outlier_influence_funcs():
    # smoke test
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    oi.summary_table(res, alpha=0.05)

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()
Code Example #21
    def setupClass(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        res1 = OLS(data.endog, data.exog).fit()
        R = np.array([[0, 1, 1, 0, 0, 0, 0],
                      [0, 1, 0, 1, 0, 0, 0],
                      [0, 1, 0, 0, 0, 0, 0],
                      [0, 0, 0, 0, 1, 0, 0],
                      [0, 0, 0, 0, 0, 1, 0]])
        q = np.array([0, 0, 0, 1, 0])
        cls.Ftest1 = res1.f_test((R, q))
Code Example #22
    def setup_class(cls):

        cls.cov_type = 'HAC'

        # check kernel specified as string
        kwds = {'kernel': 'bartlett', 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        kwds2 = {'maxlags': 2}
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
Code Example #23
def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat)
    result2 = model2.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    assert_allclose(result1.params, result2.params)
    assert_allclose(result1.bse, result2.bse)
Code Example #24
    def setup_class(cls):

        cls.cov_type = 'HAC'

        kwds={'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        # check kernel as string
        mod2 = OLS(endog, exog)
        kwds2 = {'kernel': 'uniform', 'maxlags': 2}
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds2)
Code Example #25
def test_regularized_options():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec - 1, xmat)
    result1 = model1.fit_regularized(alpha=1., L1_wt=0.5)
    model2 = OLS(yvec, xmat, offset=1)
    result2 = model2.fit_regularized(alpha=1., L1_wt=0.5,
                                     start_params=np.zeros(5))
    assert_allclose(result1.params, result2.params)
Code Example #26
    def setup_class(cls):

        cls.cov_type = 'HAC'

        kwds = {'kernel': sw.weights_uniform, 'maxlags': 2}
        mod1 = GLM(endog, exog, family=families.Gaussian())
        cls.res1 = mod1.fit(cov_type='HAC', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='HAC', cov_kwds=kwds)

        #for debugging
        cls.res3 = mod2.fit(cov_type='HAC', cov_kwds={'maxlags':2})
Code Example #27
def test_summary():
    # test 734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    with warnings.catch_warnings(record=True):
        res = OLS(y, X).fit()
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\\{Date:\\}             &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\\{Time:\\}             &).+?&",
                   "     13:46:07     &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[95.0\\% Conf. Int.]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &      -177.029   207.153       \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &        -0.112     0.040       \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &        -3.125    -0.915       \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &        -1.518    -0.549       \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &        -0.563     0.460       \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &       798.788  2859.515       \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &      -5.5e+06 -1.47e+06       \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}"""
    assert_equal(table, expected)
Code Example #28
    def setup_class(cls):
        cls.cov_type = 'hac-groupsum'
        # time index is just made up to have a test case
        time = np.tile(np.arange(7), 5)[:-1]
        mod1 = GLM(endog, exog, family=families.Gaussian())
        kwds = dict(time=pd.Series(time),  # check for #3606
                    maxlags=2,
                    use_correction='hac',
                    df_correction=False)
        cls.res1 = mod1.fit(cov_type='hac-groupsum', cov_kwds=kwds)
        cls.res1b = mod1.fit(cov_type='nw-groupsum', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='hac-groupsum', cov_kwds=kwds)
Code Example #29
def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    # covariates 0 and 2 matter
    yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat[:, [0, 2]])
    result2 = model2.fit()
    ii = [0, 2]
    assert_allclose(result1.params[ii], result2.params)
    assert_allclose(result1.bse[ii], result2.bse)
Code Example #30
    def setup_class(cls):
        cls.cov_type = 'hac-panel'
        # time index is just made up to have a test case
        groups = np.repeat(np.arange(5), 7)[:-1]
        mod1 = GLM(endog.copy(), exog.copy(), family=families.Gaussian())
        kwds = dict(groups=pd.Series(groups),  # check for #3606
                    maxlags=2,
                    kernel=sw.weights_uniform,
                    use_correction='hac',
                    df_correction=False)
        cls.res1 = mod1.fit(cov_type='hac-panel', cov_kwds=kwds)

        mod2 = OLS(endog, exog)
        cls.res2 = mod2.fit(cov_type='hac-panel', cov_kwds=kwds)
Code Example #31
File: TradingStrategy.py Project: anchalsri82/Python
print(sm_res_adf)
print(my_res_adf['adfstat'])
print("%0.4f" % my_res_adf['adfstat'])

# ===== STABILITY CHECK =====
print(key, np.abs(my_res_adf['roots']))
print("passes stability check: {0}".format(is_stable(my_res_adf['roots'])))

from statsmodels.regression.linear_model import OLS

Y = y.diff()[1:]  # must remove first element from array which is nan
X = pd.concat([x.diff()[1:], e_t_hat.shift(1)[1:]], axis=1)
X_c = add_constant(X)

sm_res_ecm = OLS(Y, X).fit()  # fit without constant
sm_res_ecm_c = OLS(Y, X_c).fit()  # fit with constant

sm_res_ecm_c.summary2()
sm_res_ecm.summary2()

# ======  FIT TO OU PROCESS  ======

# My implementations
from analysis import my_AR  # AR(p) model

# Import statsmodels equivalents to validate results
from statsmodels.tsa.ar_model import AR

# Run AR(1) model with constant term with e_t_hat as endogenous variable
my_res_ar = my_AR(endog=e_t_hat, maxlag=1, trend='c')
Code Example #32
        2000,
    ))
    T = T[::-1]

    # For percentiles 1, 5 and 10, regress on a constant, and powers of 1/T
    out = []
    for cv in critical_values:
        num_ex = results.shape[2]
        loc = np.where(percentiles == cv)[0][0]
        lhs = np.squeeze(results[loc, :, :])
        # Adjust for effective sample size; this is what the lookup code uses
        tau = np.ones((num_ex, 1)).dot(T[None, :]) - 1.0
        tau = tau.T
        lhs = lhs.ravel()
        tau = tau.ravel()
        tau = tau[:, None]
        n = lhs.shape[0]
        rhs = np.ones((n, 1))
        rhs = np.hstack((rhs, 1.0 / tau))
        rhs = np.hstack((rhs, (1.0 / tau)**2.0))
        rhs = np.hstack((rhs, (1.0 / tau)**3.0))
        res = OLS(lhs, rhs).fit()
        res.params[np.abs(res.tvalues) < 1.96] = 0.0
        out.append(res.params)

    adf_z_cv_approx[t] = np.array(out)

print("from numpy import array")
print("")
print("adf_z_cv_approx = " + str(adf_z_cv_approx))
Code Example #33
    def setup_class(cls):
        data = longley.load(as_pandas=False)
        data.exog = add_constant(data.exog, prepend=False)
        cls.res1 = OLS(data.endog, data.exog).fit()
        cls.res2 = WLS(data.endog, data.exog).fit()
Code Example #34
def notyet_atst():
    d = macrodata.load().data

    realinv = d['realinv']
    realgdp = d['realgdp']
    realint = d['realint']
    endog = realinv
    exog = add_constant(np.c_[realgdp, realint])
    res_ols1 = OLS(endog, exog).fit()

    #growth rates
    gs_l_realinv = 400 * np.diff(np.log(d['realinv']))
    gs_l_realgdp = 400 * np.diff(np.log(d['realgdp']))
    lint = d['realint'][:-1]
    tbilrate = d['tbilrate'][:-1]

    endogg = gs_l_realinv
    exogg = add_constant(np.c_[gs_l_realgdp, lint])
    exogg2 = add_constant(np.c_[gs_l_realgdp, tbilrate])

    res_ols = OLS(endogg, exogg).fit()
    res_ols2 = OLS(endogg, exogg2).fit()

    #the following were done accidentally with res_ols1 in R,
    #with original Greene data

    params = np.array(
        [-272.3986041341653, 0.1779455206941112, 0.2149432424658157])
    cov_hac_4 = np.array([
        1321.569466333051, -0.2318836566017612, 37.01280466875694,
        -0.2318836566017614, 4.602339488102263e-05, -0.0104687835998635,
        37.012804668757, -0.0104687835998635, 21.16037144168061
    ]).reshape(3, 3, order='F')
    cov_hac_10 = np.array([
        2027.356101193361, -0.3507514463299015, 54.81079621448568,
        -0.350751446329901, 6.953380432635583e-05, -0.01268990195095196,
        54.81079621448564, -0.01268990195095195, 22.92512402151113
    ]).reshape(3, 3, order='F')

    #goldfeld-quandt
    het_gq_greater = dict(statistic=13.20512768685082,
                          df1=99,
                          df2=98,
                          pvalue=1.246141976112324e-30,
                          distr='f')
    het_gq_less = dict(statistic=13.20512768685082, df1=99, df2=98, pvalue=1.)
    het_gq_2sided = dict(statistic=13.20512768685082,
                         df1=99,
                         df2=98,
                         pvalue=1.246141976112324e-30,
                         distr='f')

    #goldfeld-quandt, fraction = 0.5
    het_gq_greater_2 = dict(statistic=87.1328934692124,
                            df1=48,
                            df2=47,
                            pvalue=2.154956842194898e-33,
                            distr='f')

    gq = smsdia.het_goldfeldquandt(endog, exog, split=0.5)
    compare_t_est(gq, het_gq_greater, decimal=(13, 14))
    assert_equal(gq[-1], 'increasing')

    harvey_collier = dict(stat=2.28042114041313,
                          df=199,
                          pvalue=0.02364236161988260,
                          distr='t')
    #hc = harvtest(fm, order.by=ggdp , data = list())
    harvey_collier_2 = dict(stat=0.7516918462158783,
                            df=199,
                            pvalue=0.4531244858006127,
                            distr='t')
Code Example #35
def plot_ccpr(results, exog_idx, ax=None):
    """Plot CCPR against one regressor.

    Generates a CCPR (component and component-plus-residual) plot.

    Parameters
    ----------
    results : result instance
        A regression results instance.
    exog_idx : {int, str}
        Exogenous, explanatory variable. If string is given, it should
        be the variable name that you want to use, and you can use arbitrary
        translations as with a formula.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this axes is used for the plot instead of a new figure
        being created.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.

    See Also
    --------
    plot_ccpr_grid : Creates CCPR plot for multiple regressors in a plot grid.

    Notes
    -----
    The CCPR plot provides a way to judge the effect of one regressor on the
    response variable by taking into account the effects of the other
    independent variables. The partial residuals plot is defined as
    Residuals + B_i*X_i versus X_i. The component adds the B_i*X_i versus
    X_i to show where the fitted line would lie. Care should be taken if X_i
    is highly correlated with any of the other independent variables. If this
    is the case, the variance evident in the plot will be an underestimate of
    the true variance.

    Examples
    --------
    Using the state crime dataset plot the effect of the rate of single
    households ('single') on the murder rate while accounting for high school
    graduation rate ('hs_grad'), percentage of people in an urban area, and rate
    of poverty ('poverty').


    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt
    >>> import statsmodels.formula.api as smf

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> results = smf.ols('murder ~ hs_grad + urban + poverty + single',
    ...                   data=crime_data.data).fit()
    >>> sm.graphics.plot_ccpr(results, 'single')
    >>> plt.show()

    .. plot:: plots/graphics_regression_ccpr.py

    References
    ----------
    http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/ccpr.htm

    """
    fig, ax = utils.create_mpl_ax(ax)

    exog_name, exog_idx = utils.maybe_name_or_idx(exog_idx, results.model)
    results = maybe_unwrap_results(results)

    x1 = results.model.exog[:, exog_idx]
    #namestr = ' for %s' % self.name if self.name else ''
    x1beta = x1 * results.params[exog_idx]
    ax.plot(x1, x1beta + results.resid, 'o')
    from statsmodels.tools.tools import add_constant
    mod = OLS(x1beta, add_constant(x1)).fit()
    params = mod.params
    fig = abline_plot(*params, ax=ax)
    #ax.plot(x1, x1beta, '-')
    ax.set_title('Component and component plus residual plot')
    ax.set_ylabel("Residual + %s*beta_%d" % (exog_name, exog_idx))
    ax.set_xlabel("%s" % exog_name)

    return fig
Code Example #36
def __slope_ols(x):
    x = x[~np.isnan(x)]
    xs = 2 * (x - min(x)) / (max(x) - min(x)) - 1
    m = OLS(xs, np.vander(np.linspace(-1, 1, len(xs)), 2)).fit()
    return m.params[0]
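np.vander(..., 2) orders its columns by decreasing power, so m.params[0] above is the slope of the rescaled series. A quick check with toy data:

import numpy as np
from statsmodels.regression.linear_model import OLS

t = np.linspace(-1, 1, 21)
fit = OLS(3.0 * t + 0.5, np.vander(t, 2)).fit()
print(fit.params)  # ~[3.0, 0.5]: slope first, then intercept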
Code Example #37
r_2016 = r_shares.loc[r_shares['Year'] == 2016]

# unify county/city names by capitalizing all
acs['County/City'] = acs['County/City'].str.upper()
r_2016['County/City'] = r_2016['County/City'].str.upper()

# join dataframes
r_acs = pd.merge(acs, r_2016, on='County/City')

sbn.pairplot(
    r_acs,
    vars=['Household Income', 'Medicare Coverage', 'Foreign Born', 'R_SHARE'])
fig2 = plt.gcf()

fig2.savefig('pairplot.png', bbox_inches='tight')

from statsmodels.regression.linear_model import OLS

# define variables for regression
X = r_acs[['Household Income', 'Medicare Coverage', 'Foreign Born']]
X['Intercept'] = 1
y = r_acs['R_SHARE']

# run regression
reg = OLS(y, X)
fit = reg.fit()
fit.summary()

# save coefficients
pd.DataFrame(fit.params).to_csv('coefficients.csv')
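Appending the intercept column by hand works; a sketch of the arguably more idiomatic route via statsmodels' add_constant, assuming the same r_acs frame:

from statsmodels.tools.tools import add_constant

X = add_constant(r_acs[['Household Income', 'Medicare Coverage',
                        'Foreign Born']])
fit = OLS(r_acs['R_SHARE'], X).fit()
print(fit.params)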
Code Example #38
def get_tvalue_with_alternative_library(tested_vars, target_vars, covars=None):
    """Utility function to compute tvalues with linalg or statsmodels

    Massively univariate linear model (= each target is considered
    independently).

    Parameters
    ----------
    tested_vars: array-like, shape=(n_samples, n_regressors)
      Tested variates, the associated coefficient of which are to be tested
      independently with a t-test, resulting in as many t-values.

    target_vars: array-like, shape=(n_samples, n_targets)
      Target variates, to be approximated with a linear combination of
      the tested variates and the confounding variates.

    covars: array-like, shape=(n_samples, n_confounds)
      Confounding variates, to be fitted but not to be tested

    Returns
    -------
    t-values: np.ndarray, shape=(n_targets, n_regressors)

    """
    ### set up design
    n_samples, n_regressors = tested_vars.shape
    n_targets = target_vars.shape[1]
    if covars is not None:
        n_covars = covars.shape[1]
        design_matrix = np.hstack((tested_vars, covars))
    else:
        n_covars = 0
        design_matrix = tested_vars
    mask_covars = np.ones(n_regressors + n_covars, dtype=bool)
    mask_covars[:n_regressors] = False
    test_matrix = np.array([[1.] + [0.] * n_covars])

    ### t-values computation
    try:  # try statsmodels if available (more concise)
        from statsmodels.regression.linear_model import OLS
        t_values = np.empty((n_targets, n_regressors))
        for i in range(n_targets):
            current_target = target_vars[:, i].reshape((-1, 1))
            for j in range(n_regressors):
                current_tested_mask = mask_covars.copy()
                current_tested_mask[j] = True
                current_design_matrix = design_matrix[:, current_tested_mask]
                ols_fit = OLS(current_target, current_design_matrix).fit()
                t_values[i, j] = np.ravel(ols_fit.t_test(test_matrix).tvalue)
    except ImportError:  # use linalg if statsmodels is not available
        from numpy import linalg
        lost_dof = n_covars + 1  # fit all tested variates independently
        t_values = np.empty((n_targets, n_regressors))
        for i in range(n_regressors):
            current_tested_mask = mask_covars.copy()
            current_tested_mask[i] = True
            current_design_matrix = design_matrix[:, current_tested_mask]
            invcov = linalg.pinv(current_design_matrix)
            normalized_cov = np.dot(invcov, invcov.T)
            t_val_denom_aux = np.diag(
                np.dot(test_matrix, np.dot(normalized_cov, test_matrix.T)))
            t_val_denom_aux = t_val_denom_aux.reshape((-1, 1))
            for j in range(n_targets):
                current_target = target_vars[:, j].reshape((-1, 1))
                res_lstsq = linalg.lstsq(current_design_matrix, current_target)
                residuals = (current_target -
                             np.dot(current_design_matrix, res_lstsq[0]))
                t_val_num = np.dot(test_matrix, res_lstsq[0])
                t_val_denom = np.sqrt(
                    np.sum(residuals**2, 0) / float(n_samples - lost_dof) *
                    t_val_denom_aux)
                t_values[j, i] = np.ravel(t_val_num / t_val_denom)
    return t_values
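A hypothetical smoke check of the helper above; note that the returned array is indexed as (n_targets, n_regressors):

import numpy as np

rng = np.random.RandomState(42)
tested = rng.randn(40, 2)     # n_samples=40, n_regressors=2
targets = rng.randn(40, 3)    # n_targets=3
confounds = rng.randn(40, 1)  # n_confounds=1

tvals = get_tvalue_with_alternative_library(tested, targets, confounds)
print(tvals.shape)  # (3, 2): one t-value per (target, regressor) pair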
Code Example #39
    def setupClass(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        cls.res1 = OLS(data.endog, data.exog).fit()
        R = np.identity(7)[:-1, :]
        cls.Ftest = cls.res1.f_test(R)
Code Example #40
File: _statsmodels.py Project: shenshan/ibllib
def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7, nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.
    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : int
        The number of bins for estimating the marginal density
        of Z-scores.
    Returns
    -------
    fdr : array_like
        A vector of FDR values
    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.
    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):
    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)
    Use a Gaussian null distribution estimated from the data:
    >>> null = EmpiricalNull(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat,
             family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr
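A runnable version of the docstring's basic example, assuming the local_fdr defined above is importable; with a block of shifted scores mixed in, their FDR values should come out noticeably lower:

import numpy as np

rng = np.random.RandomState(0)
zscores = np.r_[rng.normal(size=450), rng.normal(loc=3.0, size=50)]
fdr = local_fdr(zscores)
print(fdr.min(), fdr.max())  # values are clipped to [0, 1]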
Code Example #41
def test_influence_wrapped():
    from pandas import DataFrame
    from pandas.util.testing import assert_series_equal

    d = macrodata.load_pandas().data
    #growth rates
    gs_l_realinv = 400 * np.log(d['realinv']).diff().dropna()
    gs_l_realgdp = 400 * np.log(d['realgdp']).diff().dropna()
    lint = d['realint'][:-1]

    # re-index these because they won't conform to lint
    gs_l_realgdp.index = lint.index
    gs_l_realinv.index = lint.index

    data = dict(const=np.ones_like(lint), lint=lint, lrealgdp=gs_l_realgdp)
    #order is important
    exog = DataFrame(data, columns=['const', 'lrealgdp', 'lint'])

    res = OLS(gs_l_realinv, exog).fit()

    #basic
    # already tested
    #assert_almost_equal(lsdiag['cov.scaled'],
    #                    res.cov_params().values.ravel(), decimal=14)
    #assert_almost_equal(lsdiag['cov.unscaled'],
    #                    res.normalized_cov_params.values.ravel(), decimal=14)

    infl = oi.OLSInfluence(res)

    # smoke test just to make sure it works, results separately tested
    df = infl.summary_frame()
    assert_(isinstance(df, DataFrame))

    #this test is slow
    path = os.path.join(cur_dir, "results", "influence_lsdiag_R.json")
    with open(path, "r") as fp:
        lsdiag = json.load(fp)

    c0, c1 = infl.cooks_distance  #TODO: what's c1, it's pvalues? -ss

    #NOTE: we get a hard-coded 5 decimals with pandas testing
    assert_almost_equal(c0, lsdiag['cooks'], 14)
    assert_almost_equal(infl.hat_matrix_diag, (lsdiag['hat']), 14)
    assert_almost_equal(infl.resid_studentized_internal, lsdiag['std.res'], 14)

    #slow:
    dffits, dffth = infl.dffits
    assert_almost_equal(dffits, lsdiag['dfits'], 14)
    assert_almost_equal(infl.resid_studentized_external, lsdiag['stud.res'],
                        14)

    import pandas
    fn = os.path.join(cur_dir, "results/influence_measures_R.csv")
    infl_r = pandas.read_csv(fn, index_col=0)
    conv = lambda s: 1 if s == 'TRUE' else 0
    fn = os.path.join(cur_dir, "results/influence_measures_bool_R.csv")
    #not used yet:
    #infl_bool_r  = pandas.read_csv(fn, index_col=0,
    #                                converters=dict(zip(lrange(7),[conv]*7)))
    infl_r2 = np.asarray(infl_r)
    #TODO: finish wrapping this stuff
    assert_almost_equal(infl.dfbetas, infl_r2[:, :3], decimal=13)
    assert_almost_equal(infl.cov_ratio, infl_r2[:, 4], decimal=14)
Code Example #42
    def fit(self):
        """estimate the model and compute the Anova table

        Returns
        -------
        AnovaResults instance

        """
        y = self.data[self.depvar].values

        # Construct OLS endog and exog from string using patsy
        within = ['C(%s, Sum)' % i for i in self.within]
        subject = 'C(%s, Sum)' % self.subject
        factors = within + [subject]
        x = patsy.dmatrix('*'.join(factors), data=self.data)
        term_slices = x.design_info.term_name_slices
        for key in term_slices:
            ind = np.array([False] * x.shape[1])
            ind[term_slices[key]] = True
            term_slices[key] = np.array(ind)
        term_exclude = [':'.join(factors)]
        ind = _not_slice(term_slices, term_exclude, x.shape[1])
        x = x[:, ind]

        # Fit OLS
        model = OLS(y, x)
        results = model.fit()
        if model.rank < x.shape[1]:
            raise ValueError('Independent variables are collinear.')
        for i in term_exclude:
            term_slices.pop(i)
        for key in term_slices:
            term_slices[key] = term_slices[key][ind]
        params = results.params
        df_resid = results.df_resid
        ssr = results.ssr

        columns = ['F Value', 'Num DF', 'Den DF', 'Pr > F']
        anova_table = pd.DataFrame(np.zeros((0, 4)), columns=columns)

        for key in term_slices:
            if self.subject not in key and key != 'Intercept':
                #  Independent variables are orthogonal
                ssr1, df_resid1 = _ssr_reduced_model(y, x, term_slices, params,
                                                     [key])
                df1 = df_resid1 - df_resid
                msm = (ssr1 - ssr) / df1
                if (key == ':'.join(factors[:-1])
                        or (key + ':' + subject not in term_slices)):
                    mse = ssr / df_resid
                    df2 = df_resid
                else:
                    ssr1, df_resid1 = _ssr_reduced_model(
                        y, x, term_slices, params, [key + ':' + subject])
                    df2 = df_resid1 - df_resid
                    mse = (ssr1 - ssr) / df2
                F = msm / mse
                p = stats.f.sf(F, df1, df2)
                term = key.replace('C(', '').replace(', Sum)', '')
                anova_table.loc[term, 'F Value'] = F
                anova_table.loc[term, 'Num DF'] = df1
                anova_table.loc[term, 'Den DF'] = df2
                anova_table.loc[term, 'Pr > F'] = p

        return AnovaResults(anova_table)
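This fit method mirrors statsmodels' public repeated-measures ANOVA; a hypothetical usage sketch through that wrapper, with a small balanced design:

import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM

rng = np.random.RandomState(0)
df = pd.DataFrame({
    'subject': np.repeat(np.arange(8), 3),  # 8 subjects...
    'cond': np.tile(['a', 'b', 'c'], 8),    # ...each measured in 3 conditions
    'rt': rng.normal(size=24),
})
res = AnovaRM(df, depvar='rt', subject='subject', within=['cond']).fit()
print(res.anova_table)  # F Value, Num DF, Den DF, Pr > F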
Code Example #43
File: ccc_gct.py Project: lunglungyu/lwc1503
def hacked_gct(x, maxlag, addconst=True, verbose=True):

    #from scipy import stats

    x = np.asarray(x)

    if x.shape[0] <= 3 * maxlag + int(addconst):
        raise ValueError(
            "Insufficient observations. Maximum allowable "
            "lag is {0}".format(int((x.shape[0] - int(addconst)) / 3) - 1))

    resli = {}

    for mlg in range(1, maxlag + 1):
        result = {}
        if verbose:
            print('\nGranger Causality')
            print('number of lags (no zero)', mlg)
        mxlg = mlg

        # create lagmat of both time series
        dta = lagmat2ds(x, mxlg, trim='both', dropex=1)

        #add constant
        if addconst:
            '''dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)'''
            dtajoint = add_constant(dta[:, 1:], prepend=False)
        else:
            raise NotImplementedError('Not Implemented')
            #dtaown = dta[:, 1:mxlg]
            #dtajoint = dta[:, 1:]

        # Run ols on both models without and with lags of second variable
        '''res2down = OLS(dta[:, 0], dtaown).fit()'''
        res2down = 'skipped'
        res2djoint = OLS(dta[:, 0], dtajoint).fit()

        #print results
        #for ssr based tests see:
        #http://support.sas.com/rnd/app/examples/ets/granger/index.htm
        #the other tests are made-up
        '''
        # Granger Causality test using ssr (F statistic)
        fgc1 = ((res2down.ssr - res2djoint.ssr) /
                res2djoint.ssr / mxlg * res2djoint.df_resid)
        if verbose:
            print('ssr based F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                   ' df_num=%d' % (fgc1,
                                    stats.f.sf(fgc1, mxlg,
                                               res2djoint.df_resid),
                                    res2djoint.df_resid, mxlg))
        result['ssr_ftest'] = (fgc1,
                               stats.f.sf(fgc1, mxlg, res2djoint.df_resid),
                               res2djoint.df_resid, mxlg)

        # Granger Causality test using ssr (ch2 statistic)
        fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
        if verbose:
            print('ssr based chi2 test:   chi2=%-8.4f, p=%-8.4f, '
                   'df=%d' % (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg))
        result['ssr_chi2test'] = (fgc2, stats.chi2.sf(fgc2, mxlg), mxlg)

        #likelihood ratio test pvalue:
        lr = -2 * (res2down.llf - res2djoint.llf)
        if verbose:
            print('likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' %
                   (lr, stats.chi2.sf(lr, mxlg), mxlg))
        result['lrtest'] = (lr, stats.chi2.sf(lr, mxlg), mxlg)
        '''
        # F test that all lag coefficients of exog are zero
        rconstr = np.column_stack((np.zeros(
            (mxlg, mxlg)), np.eye(mxlg, mxlg), np.zeros((mxlg, 1))))
        ftres = res2djoint.f_test(rconstr)
        if verbose:
            print('parameter F test:         F=%-8.4f, p=%-8.4f, df_denom=%d,'
                  ' df_num=%d' %
                  (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num))
        result['params_ftest'] = (np.squeeze(ftres.fvalue)[()],
                                  np.squeeze(ftres.pvalue)[()], ftres.df_denom,
                                  ftres.df_num)

        resli[mxlg] = (result, [res2down, res2djoint, rconstr])

    return resli
Code Example #44
def plot_partregress(endog,
                     exog_i,
                     exog_others,
                     data=None,
                     title_kwargs={},
                     obs_labels=True,
                     label_kwargs={},
                     ax=None,
                     ret_coords=False,
                     **kwargs):
    """Plot partial regression for a single regressor.

    Parameters
    ----------
    endog : {ndarray, str}
        Endogenous or response variable. If a string is given, you can use
        arbitrary translations as with a formula.
    exog_i : {ndarray, str}
        Exogenous, explanatory variable. If a string is given, you can use
        arbitrary translations as with a formula.
    exog_others : {ndarray, list[str]}
        Other exogenous, explanatory variables. If a list of strings is given,
        each item is a term in the formula. You can use arbitrary translations
        as with a formula. The effect of these variables will be removed by
        OLS regression.
    data : DataFrame, dict, or recarray
        Some kind of data structure with names if the other variables are
        given as strings.
    title_kwargs : dict
        Keyword arguments to pass on for the title. The key to control the
        fonts is fontdict.
    obs_labels : bool or array_like
        Whether or not to annotate the plot points with their observation
        labels. If obs_labels is a boolean, the point labels will try to do
        the right thing. First it will try to use the index of data, then
        fall back to the index of exog_i. Alternatively, you may give an
        array-like object corresponding to the observation numbers.
    label_kwargs : dict
        Keyword arguments that control annotate for the observation labels.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used for the plot instead of a new figure
        being created.
    ret_coords : bool
        If True will return the coordinates of the points in the plot. You
        can use this to add your own annotations.
    **kwargs
        The keyword arguments passed to plot for the points.

    Returns
    -------
    fig : Figure
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    coords : list, optional
        If ret_coords is True, return a tuple of arrays (x_coords, y_coords).

    Notes
    -----
    The slope of the fitted line is that of `exog_i` in the full
    multiple regression. The individual points can be used to assess the
    influence of points on the estimated coefficient.

    See Also
    --------
    plot_partregress_grid : Plot partial regression for a set of regressors.

    Examples
    --------
    Load the Statewide Crime data set and plot partial regression of the rate
    of high school graduation (hs_grad) on the murder rate(murder).

    The effects of the percent of the population living in urban areas (urban),
    below the poverty line (poverty), and in a single-person household (single)
    are removed by OLS regression.

    >>> import statsmodels.api as sm
    >>> import matplotlib.pyplot as plt

    >>> crime_data = sm.datasets.statecrime.load_pandas()
    >>> sm.graphics.plot_partregress(endog='murder', exog_i='hs_grad',
    ...                              exog_others=['urban', 'poverty', 'single'],
    ...                              data=crime_data.data, obs_labels=False)
    >>> plt.show()

    .. plot:: plots/graphics_regression_partregress.py

    More detailed examples can be found in the Regression Plots notebook
    on the examples page.

    """
    #NOTE: there is no interaction between possible missing data and
    #obs_labels yet, so this will need to be tweaked a bit for this case
    fig, ax = utils.create_mpl_ax(ax)

    # strings, use patsy to transform to data
    if isinstance(endog, str):
        endog = dmatrix(endog + "-1", data)

    if isinstance(exog_others, str):
        RHS = dmatrix(exog_others, data)
    elif isinstance(exog_others, list):
        RHS = "+".join(exog_others)
        RHS = dmatrix(RHS, data)
    else:
        RHS = exog_others
    RHS_isempty = False
    if isinstance(RHS, np.ndarray) and RHS.size == 0:
        RHS_isempty = True
    elif isinstance(RHS, pd.DataFrame) and RHS.empty:
        RHS_isempty = True
    if isinstance(exog_i, str):
        exog_i = dmatrix(exog_i + "-1", data)

    # all arrays or pandas-like

    if RHS_isempty:
        ax.plot(endog, exog_i, 'o', **kwargs)
        fitted_line = OLS(endog, exog_i).fit()
        x_axis_endog_name = 'x' if isinstance(exog_i,
                                              np.ndarray) else exog_i.name
        y_axis_endog_name = 'y' if isinstance(
            endog, np.ndarray) else endog.design_info.column_names[0]
    else:
        res_yaxis = OLS(endog, RHS).fit()
        res_xaxis = OLS(exog_i, RHS).fit()
        xaxis_resid = res_xaxis.resid
        yaxis_resid = res_yaxis.resid
        x_axis_endog_name = res_xaxis.model.endog_names
        y_axis_endog_name = res_yaxis.model.endog_names
        ax.plot(xaxis_resid, yaxis_resid, 'o', **kwargs)
        fitted_line = OLS(yaxis_resid, xaxis_resid).fit()

    fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)

    if x_axis_endog_name == 'y':  # for no names regression will just get a y
        x_axis_endog_name = 'x'  # this is misleading, so use x
    ax.set_xlabel("e(%s | X)" % x_axis_endog_name)
    ax.set_ylabel("e(%s | X)" % y_axis_endog_name)
    ax.set_title('Partial Regression Plot', **title_kwargs)

    #NOTE: if we want to get super fancy, we could annotate if a point is
    #clicked using this widget
    #http://stackoverflow.com/questions/4652439/
    #is-there-a-matplotlib-equivalent-of-matlabs-datacursormode/
    #4674445#4674445
    if obs_labels is True:
        if data is not None:
            obs_labels = data.index
        elif hasattr(exog_i, "index"):
            obs_labels = exog_i.index
        else:
            obs_labels = res_xaxis.model.data.row_labels
        #NOTE: row_labels can be None.
        #Maybe we should fix this to never be the case.
        if obs_labels is None:
            obs_labels = lrange(len(exog_i))

    if obs_labels is not False:  # could be array_like
        if len(obs_labels) != len(exog_i):
            raise ValueError("obs_labels does not match length of exog_i")
        label_kwargs.update(dict(ha="center", va="bottom"))
        ax = utils.annotate_axes(lrange(len(obs_labels)),
                                 obs_labels,
                                 lzip(res_xaxis.resid, res_yaxis.resid),
                                 [(0, 5)] * len(obs_labels),
                                 "x-large",
                                 ax=ax,
                                 **label_kwargs)

    if ret_coords:
        return fig, (res_xaxis.resid, res_yaxis.resid)
    else:
        return fig
Code Example #45
    def test_norm_resid_zero_variance(self):
        with warnings.catch_warnings(record=True):
            y = self.res1.model.endog
            res = OLS(y, y).fit()
            assert_allclose(res.scale, 0, atol=1e-20)
            assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)
Code Example #46
nobs = 100
lb, ub = -1, 2
x = np.linspace(lb, ub, nobs)
x = np.sin(x)
exog = x[:, None]**np.arange(order + 1)
y_true = exog.sum(1)
y = y_true + sigma_noise * np.random.randn(nobs)

#xind = np.argsort(x)
pmod = smoothers.PolySmoother(2, x)
pmod.fit(y)  #no return
y_pred = pmod.predict(x)
error = y - y_pred
mse = (error * error).mean()
print(mse)
res_ols = OLS(y, exog[:, :3]).fit()
print(np.squeeze(pmod.coef) - res_ols.params)

weights = np.ones(nobs)
weights[:nobs // 3] = 0.1
weights[-nobs // 5:] = 2

pmodw = smoothers.PolySmoother(2, x)
pmodw.fit(y, weights=weights)  #no return
y_predw = pmodw.predict(x)
error = y - y_predw
mse = (error * error).mean()
print(mse)
res_wls = WLS(y, exog[:, :3], weights=weights).fit()
print(np.squeeze(pmodw.coef) - res_wls.params)
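
The check above rests on the identity that a degree-2 polynomial smoother is just OLS on the powers of x. A self-contained numpy-only sketch of that identity (the noise scale is an assumed value):

import numpy as np

rng = np.random.default_rng(0)
x = np.sin(np.linspace(-1, 2, 100))
y = 1 + 2 * x - 3 * x**2 + 0.5 * rng.normal(size=100)

exog = x[:, None] ** np.arange(3)             # columns: 1, x, x**2
beta, *_ = np.linalg.lstsq(exog, y, rcond=None)
coef_polyfit = np.polyfit(x, y, deg=2)[::-1]  # polyfit returns highest degree first

print(np.allclose(beta, coef_polyfit))        # True: the same least-squares fit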
Code example #47
def setup_class(cls):
    data = longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    cls.endog = data.endog
    cls.exog = data.exog
    cls.ols_model = OLS(data.endog, data.exog)
Code example #48
z = y_true  #alias check
d = x
y = y_true + sigma_noise * np.random.randn(nobs)

example = 1

if example == 1:
    m = AdditiveModel(d)
    m.fit(y)

    y_pred = m.results.predict(d)

for ss in m.smoothers:
    print(ss.params)

res_ols = OLS(y, exog_reduced).fit()
print(res_ols.params)

#assert_almost_equal(y_pred, res_ols.fittedvalues, 3)

if example > 0:
    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(exog)

    y_pred = m.results.mu  # + m.results.alpha #m.results.predict(d)
    plt.figure()
    plt.subplot(2, 2, 1)
    plt.plot(y, '.', alpha=0.25)
    plt.plot(y_true, 'k-', label='true')
Code example #49
def linear_regression_orp(db, test_swp, nb):
    from statsmodels.regression.linear_model import OLS
    random_seed = 0
    random.seed(random_seed)

    # Load data
    db.reset_index(inplace=True, drop=True)
    # X_train, y_train, index, cr = orp_db_generator(db, nb = 80)

    filename = "orp_80_biggest2"
    # np_save_training_data(filename,X_train, y_train)
    # print('saved')

    X_train, y_train = np_read_training_data(filename)
    X_test, y_test, index2, created = orp_db_generator(test_swp,
                                                       nb=0,
                                                       bool=True)

    from scipy.stats import norm

    # Build the quantile
    def ols_quantile(m, X, q):
        # m: OLS statsmodels model.
        # X: X matrix.
        # q: Quantile.
        mean_pred = m.predict(X)
        se = np.sqrt(m.scale)
        print(se)
        return mean_pred + norm.ppf(q) * se

    model = OLS(y_train[:].astype(float), X_train.astype(float))
    model = model.fit()
    print('model fitted')
    predictions = pd.DataFrame(y_test)
    predictions['lower'] = ols_quantile(model, X_test.astype(float), 0.48)
    predictions['upper'] = ols_quantile(model, X_test.astype(float), 0.52)

    # Displays the main metrics
    predictions = predictions.set_index(pd.DatetimeIndex(created[0].values))
    predictions.rename(columns={0: 'measures'}, inplace=True)
    axes = predictions.plot(style='.-', color=['blue', 'red', 'green', 'red'])

    print("Mean absolute error " + str(
        mean_absolute_error(y_pred=(predictions['lower'] +
                                    predictions['upper']) / 2,
                            y_true=y_test)))
    print("Quantile loss {}".format(
        full_quantile_loss(y_test,
                           predictions['lower'],
                           predictions['upper'],
                           alpha=0.05)))
    predictions.rename(columns={'measures': 0}, inplace=True)
    print("Coverage {}".format(coverage(predictions)))
    predictions.rename(columns={0: 'measures'}, inplace=True)

    #  Count the number of values below the lower interval bound and above the upper interval bound
    zz1 = np.greater_equal(y_test, predictions['upper'])
    zz2 = np.greater_equal(predictions['lower'], y_test)
    print(sum(zz1))
    print(sum(zz2))
    anomalies = [x or y for (x, y) in zip(zz1, zz2)]
    # anomalies = int(anomalies)
    anomalies = list(map(int, anomalies))
    anomalies = [element * 400 for element in anomalies]
    # print(anomalies)
    # anomalies = np.asarray(anomalies)
    anomalies = pd.DataFrame(anomalies)
    # print(anomalies[0].sum())

    # Displays a star for each anomaly
    anomalies = anomalies.set_index(pd.DatetimeIndex(created[0].values))
    anomalies.plot(color='r', marker="*", linewidth=0, ax=axes)

    predictions.rename(columns={'measures': 0}, inplace=True)
    zz1 = np.greater_equal(predictions[0], predictions['upper'])
    zz2 = np.greater_equal(predictions['lower'], predictions[0])
    deriv = [x or y for (x, y) in zip(zz1, zz2)]

    filename = "lin_swp_orp_" + str(nb)
    np.save(filename, deriv)

    filename = "lin_index_orp_" + str(nb)
    np.save(filename, index2)
    predictions.rename(columns={0: 'measures'}, inplace=True)

    return predictions
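
The ols_quantile helper above builds its bounds from a homoskedastic normal error model: the q-th conditional quantile is the mean prediction shifted by norm.ppf(q) * sqrt(scale). A self-contained sketch of just that idea on synthetic data (not the orp pipeline):

import numpy as np
from scipy.stats import norm
from statsmodels.api import OLS, add_constant

rng = np.random.default_rng(0)
x = rng.normal(size=200)
y = 1.0 + 3.0 * x + rng.normal(scale=2.0, size=200)

X = add_constant(x)
res = OLS(y, X).fit()
sigma = np.sqrt(res.scale)  # residual standard deviation

lower = res.predict(X) + norm.ppf(0.05) * sigma
upper = res.predict(X) + norm.ppf(0.95) * sigma
print(np.mean((y >= lower) & (y <= upper)))  # empirical coverage, roughly 0.90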
Code example #50
File: ar_model.py  Project: murphyec/statsmodels
    def fit(self,
            maxlag=None,
            method='cmle',
            ic=None,
            trend='c',
            transparams=True,
            start_params=None,
            solver='lbfgs',
            maxiter=35,
            full_output=1,
            disp=1,
            callback=None,
            **kwargs):
        """
        Fit the unconditional maximum likelihood of an AR(p) process.

        Parameters
        ----------
        maxlag : int
            If `ic` is None, then maxlag is the lag length used in fit.  If
            `ic` is specified then maxlag is the highest lag order used to
            select the correct lag order.  If maxlag is None, the default is
            round(12*(nobs/100.)**(1/4.))
        method : str {'cmle', 'mle'}, optional
            cmle - Conditional maximum likelihood using OLS
            mle - Unconditional (exact) maximum likelihood.  See `solver`
            and the Notes.
        ic : str {'aic','bic','hqic','t-stat'}
            Criterion used for selecting the optimal lag length.
            aic - Akaike Information Criterion
            bic - Bayes Information Criterion
            hqic - Hannan-Quinn Information Criterion
            t-stat - Based on last lag
            If any of the information criteria are selected, the lag length
            which results in the lowest value is selected.  If t-stat, the
            model starts with maxlag and drops a lag until the highest lag
            has a t-stat that is significant at the 95 % level.
        trend : str {'c','nc'}
            Whether to include a constant or not. 'c' - include constant.
            'nc' - no constant.

        The below can be specified if method is 'mle'

        transparams : bool, optional
            Whether or not to transform the parameters to ensure stationarity.
            Uses the transformation suggested in Jones (1980).
        start_params : array_like, optional
            A first guess on the parameters.  Default is cmle estimates.
        solver : str or None, optional
            Solver to be used if method is 'mle'.  The default is 'lbfgs'
            (limited memory Broyden-Fletcher-Goldfarb-Shanno).  Other choices
            are 'bfgs', 'newton' (Newton-Raphson), 'nm' (Nelder-Mead),
            'cg' - (conjugate gradient), 'ncg' (non-conjugate gradient),
            and 'powell'.
        maxiter : int, optional
            The maximum number of function evaluations. Default is 35.
        tol : float
            The convergence tolerance.  Default is 1e-08.
        full_output : bool, optional
            If True, all output from solver will be available in
            the Results object's mle_retvals attribute.  Output is dependent
            on the solver.  See Notes for more information.
        disp : bool, optional
            If True, convergence information is output.
        callback : function, optional
            Called after each iteration as callback(xk) where xk is the current
            parameter vector.
        kwargs
            See Notes for keyword arguments that can be passed to fit.

        References
        ----------
        Jones, R.H. 1980 "Maximum likelihood fitting of ARMA models to time
            series with missing observations."  `Technometrics`.  22.3.
            389-95.

        See Also
        --------
        statsmodels.base.model.LikelihoodModel.fit
        """
        method = method.lower()
        if method not in ['cmle', 'yw', 'mle']:
            raise ValueError("Method %s not recognized" % method)
        self.method = method
        self.trend = trend
        self.transparams = transparams
        nobs = len(self.endog)  # overwritten if method is 'cmle'
        endog = self.endog

        if maxlag is None:
            maxlag = int(round(12 * (nobs / 100.)**(1 / 4.)))
        k_ar = maxlag  # stays this if ic is None

        # select lag length
        if ic is not None:
            ic = ic.lower()
            if ic not in ['aic', 'bic', 'hqic', 't-stat']:
                raise ValueError("ic option %s not understood" % ic)
            k_ar = self.select_order(k_ar, ic, trend, method)

        self.k_ar = k_ar  # change to what was chosen by ic

        # redo estimation for best lag
        # make LHS
        Y = endog[k_ar:, :]
        # make lagged RHS
        X = self._stackX(k_ar, trend)  # sets self.k_trend
        k_trend = self.k_trend
        self.exog_names = util.make_lag_names(self.endog_names, k_ar, k_trend)
        self.Y = Y
        self.X = X

        if method == "cmle":  # do OLS
            arfit = OLS(Y, X).fit()
            params = arfit.params
            self.nobs = nobs - k_ar
            self.sigma2 = arfit.ssr / arfit.nobs  # needed for predict fcasterr

        elif method == "mle":
            solver = solver.lower()
            self.nobs = nobs
            if start_params is None:
                start_params = OLS(Y, X).fit().params
            else:
                if len(start_params) != k_trend + k_ar:
                    raise ValueError("Length of start params is %d. There"
                                     " are %d parameters." %
                                     (len(start_params), k_trend + k_ar))
            start_params = self._invtransparams(start_params)
            if solver == 'lbfgs':
                kwargs.setdefault('pgtol', 1e-8)
                kwargs.setdefault('factr', 1e2)
                kwargs.setdefault('m', 12)
                kwargs.setdefault('approx_grad', True)
            mlefit = super(AR, self).fit(start_params=start_params,
                                         method=solver,
                                         maxiter=maxiter,
                                         full_output=full_output,
                                         disp=disp,
                                         callback=callback,
                                         **kwargs)

            params = mlefit.params
            if self.transparams:
                params = self._transparams(params)
                self.transparams = False  # turn off now for other results

        # don't use yw, because we can't estimate the constant
        #elif method == "yw":
        #    params, omega = yule_walker(endog, order=maxlag,
        #            method="mle", demean=False)
        #    # how to handle inference after Yule-Walker?
        #    self.params = params #TODO: don't attach here
        #    self.omega = omega

        pinv_exog = np.linalg.pinv(X)
        normalized_cov_params = np.dot(pinv_exog, pinv_exog.T)
        arfit = ARResults(self, params, normalized_cov_params)
        if method == 'mle' and full_output:
            arfit.mle_retvals = mlefit.mle_retvals
            arfit.mle_settings = mlefit.mle_settings
        return ARResultsWrapper(arfit)
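
A minimal usage sketch for this fit method on the legacy AR class, letting AIC pick the lag order. This assumes the statsmodels era of the snippet; newer releases replace AR with tsa.ar_model.AutoReg:

import numpy as np
from statsmodels.tsa.ar_model import AR

rng = np.random.default_rng(0)
y = np.zeros(500)
for t in range(2, 500):  # simulate an AR(2) process
    y[t] = 0.6 * y[t - 1] - 0.3 * y[t - 2] + rng.normal()

res = AR(y).fit(maxlag=12, ic='aic', trend='c', method='cmle')
print(res.k_ar)      # lag order chosen by AIC
print(res.params[:3])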
Code example #51
def calc_ols_rsquared(df, idx):
    return OLS(df.iloc[:, idx],
               df.loc[:, np.arange(df.shape[1]) != idx]).fit().rsquared
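
The R-squared of one column regressed on all the others is exactly the quantity behind the variance inflation factor, VIF = 1 / (1 - R^2). A sketch checking that relationship against statsmodels' own helper:

import numpy as np
import pandas as pd
from statsmodels.api import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor

rng = np.random.default_rng(0)
a = rng.normal(size=200)
df = pd.DataFrame({0: a, 1: a + 0.1 * rng.normal(size=200),
                   2: rng.normal(size=200)})

idx = 0
r2 = OLS(df.iloc[:, idx],
         df.loc[:, np.arange(df.shape[1]) != idx]).fit().rsquared
print(1 / (1 - r2))
print(variance_inflation_factor(df.to_numpy(), idx))  # same value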
Code example #52
def setupClass(cls):
    super(TestNxNxOne, cls).setupClass()
    cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one)
    cls.mod2.df_model += 1
    cls.res2 = cls.mod2.fit()
Code example #53
def setup_class(cls):
    data = stackloss.load(as_pandas=False)
    data.exog = add_constant(data.exog)
    cls.res1 = OLS(data.endog, data.exog).fit()
    cls.res2 = RegressionResults()
Code example #54
File: performance.py  Project: htq310542/alphalens
def factor_alpha_beta(factor_data,
                      returns=None,
                      demeaned=True,
                      group_adjust=False,
                      equal_weight=False):
    """
    Compute the alpha (excess returns), alpha t-stat (alpha significance),
    and beta (market exposure) of a factor. A regression is run with
    the period wise factor universe mean return as the independent variable
    and mean period wise return from a portfolio weighted by factor values
    as the dependent variable.

    Parameters
    ----------
    factor_data : pd.DataFrame - MultiIndex
        A MultiIndex DataFrame indexed by date (level 0) and asset (level 1),
        containing the values for a single alpha factor, forward returns for
        each period, the factor quantile/bin that factor value belongs to, and
        (optionally) the group the asset belongs to.
        - See full explanation in utils.get_clean_factor_and_forward_returns
    returns : pd.DataFrame, optional
        Period wise factor returns. If this is None then it will be computed
        with 'factor_returns' function and the passed flags: 'demeaned',
        'group_adjust', 'equal_weight'
    demeaned : bool
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_returns for a full explanation
    group_adjust : bool
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_returns for a full explanation
    equal_weight : bool, optional
        Control how to build factor returns used for alpha/beta computation
        -- see performance.factor_returns for a full explanation

    Returns
    -------
    alpha_beta : pd.DataFrame
        A dataframe with rows 'Ann. alpha' and 'beta' and one column per
        forward-return period.
    """

    if returns is None:
        returns = \
            factor_returns(factor_data, demeaned, group_adjust, equal_weight)

    universe_ret = factor_data.groupby(level='date')[
        utils.get_forward_returns_columns(factor_data.columns)] \
        .mean().loc[returns.index]

    if isinstance(returns, pd.Series):
        returns.name = universe_ret.columns.values[0]
        returns = pd.DataFrame(returns)

    alpha_beta = pd.DataFrame()
    for period in returns.columns.values:
        x = universe_ret[period].values
        y = returns[period].values
        x = add_constant(x)

        reg_fit = OLS(y, x).fit()
        try:
            alpha, beta = reg_fit.params
        except ValueError:
            alpha_beta.loc['Ann. alpha', period] = np.nan
            alpha_beta.loc['beta', period] = np.nan
        else:
            freq_adjust = pd.Timedelta('252Days') / pd.Timedelta(period)

            alpha_beta.loc['Ann. alpha', period] = \
                (1 + alpha) ** freq_adjust - 1
            alpha_beta.loc['beta', period] = beta

    return alpha_beta
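
The annualization step above compounds the period alpha over the number of periods in a 252-day trading year. A worked sketch of that arithmetic (the alpha value is made up):

import pandas as pd

alpha = 0.001  # per-period alpha for a 5-day holding period
freq_adjust = pd.Timedelta('252Days') / pd.Timedelta('5D')
print(freq_adjust)                     # 50.4 periods per year
print((1 + alpha) ** freq_adjust - 1)  # ~0.0517, i.e. about 5.2% annualized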
Code example #55
def ax_regress(ax,
               x,
               vector,
               display='equation',
               pos_xy=[0.1, 0.9],
               args_pt={'ls': '-'},
               args_ln={'color': 'k'},
               args_ci={
                   'color': 'k',
                   'alpha': 0.2
               },
               args_tx={'color': 'k'}):
    """
    Plot a scatter of the data with an OLS trend line and confidence band.

    Parameters
    ----------
    ax: matplotlib.pyplot.axis
    x: 1-d array
        The x-values in the regression.
    vector: 1-d array
        The y-values in the regression.
    display: None or str
        If None, does not display the regression equation.
        If 'equation', display the regression equation.
        If 'pearson', display the Pearson correlation.
    pos_xy: [float, float]
        The position to place the annotation in the normalized axis unit.
    args_pt, args_ln, args_ci, args_tx: dict
        Keyword arguments to be passed into the scatter plot, regression
        line, confidence interval for the regression line, and annotation
        text plotting functions.
    """
    temp = (~np.isnan(vector)) & (~np.isnan(x))
    x = x[temp]
    vector = vector[temp]

    ax.plot(x, vector, **args_pt)

    reg = OLS(vector, add_constant(x)).fit()
    ax.plot(x, x * reg.params[1] + reg.params[0], **args_ln)

    _, predict_ci_low, predict_ci_upp = wls_prediction_std(
        reg, exog=reg.model.exog, weights=np.ones(len(reg.model.exog)))
    x_ind = np.argsort(x)
    ax.fill_between(x[x_ind],
                    predict_ci_low[x_ind],
                    predict_ci_upp[x_ind],
                    interpolate=True,
                    **args_ci)

    if display == 'equation':
        ax.text(pos_xy[0],
                pos_xy[1],
                ppf(reg.params[1], reg.params[0], reg.pvalues[1],
                    reg.pvalues[0]),
                transform=ax.transAxes,
                **args_tx)
    elif display == 'pearson':
        r, pval = pearsonr(x, vector)
        ax.text(pos_xy[0],
                pos_xy[1], ('%.3f' % r) + ppp(pval),
                transform=ax.transAxes,
                **args_tx)
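
A minimal usage sketch for ax_regress with synthetic data. display=None sidesteps the ppf/ppp annotation helpers, which are defined elsewhere in this project, and the function's module-level imports (OLS, add_constant, wls_prediction_std) are assumed to be in scope:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
x = np.linspace(0, 10, 50)
y = 0.5 * x + rng.normal(size=50)

fig, ax = plt.subplots()
ax_regress(ax, x, y, display=None, args_pt={'ls': '', 'marker': 'o'})
plt.show()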
Code example #56
File: johansen_test.py  Project: xiaojay/AlgoRepo
def detrend(y, order):
    if order == -1:
        return y
    return OLS(y, np.vander(np.linspace(-1, 1, len(y)), order + 1)).fit().resid
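
Because np.vander(np.linspace(-1, 1, n), order + 1) spans the polynomials up to degree `order`, detrend with order=0 demeans the series and order=1 also removes a linear trend. A quick check (OLS assumed imported as in this file):

import numpy as np

y = 5.0 + 2.0 * np.arange(100)
print(np.allclose(detrend(y, 0), y - y.mean()))  # True: order 0 demeans
print(np.allclose(detrend(y, 1), 0, atol=1e-8))  # True: a line detrends to ~0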
Code example #57
def perform_regional_gwas_helper(outfile,
                                 pheno_and_covars_fname,
                                 shared_covars_fname,
                                 untransformed_phenotypes_fname,
                                 get_genotype_iter,
                                 phenotype,
                                 binary,
                                 region,
                                 runtype,
                                 conditional_covars_fname=None):

    outfile.write("chrom\tpos\talleles\tlocus_filtered\t"
                  f"p_{phenotype}\tcoeff_{phenotype}\t")
    if binary != 'logistic':
        outfile.write(f'se_{phenotype}\tR^2\t')
    else:
        outfile.write("unused_col\tunused_col\t")
    outfile.flush()

    n_loci = 0
    batch_time = 0
    batch_size = 50
    total_time = 0

    pheno_specific_covars = np.load(pheno_and_covars_fname)
    shared_covars = np.load(shared_covars_fname)
    covars = utils.merge_arrays(pheno_specific_covars, shared_covars)

    if conditional_covars_fname:
        gt_covars = np.load(conditional_covars_fname)
        covars = utils.merge_arrays(covars, gt_covars)

    # order samples according to order in genetics files
    bgen_samples = sample_utils.get_all_samples()
    assert len(bgen_samples) == 487409
    samples_array = np.array(bgen_samples, dtype=float).reshape(-1, 1)
    merge = utils.merge_arrays(samples_array, covars)
    unfiltered_samples = ~np.isnan(merge[:, 1])

    outcome = merge[unfiltered_samples, 1].copy()
    covars = merge[unfiltered_samples, :]
    covars = (covars - np.mean(covars, axis=0)) / np.std(covars, axis=0)
    covars[:, 1] = 1  # reuse the column that was the outcome as the intercept

    ori_phenotypes = np.load(untransformed_phenotypes_fname)
    ori_phenotypes = utils.merge_arrays(samples_array, ori_phenotypes)[:, 1]
    ori_phenotypes = ori_phenotypes[unfiltered_samples]

    # first yield is special
    genotype_iter = get_genotype_iter(unfiltered_samples)
    extra_detail_fields = next(genotype_iter)
    outfile.write('\t'.join(extra_detail_fields) + '\t')

    if not binary:
        stat = 'mean'
    else:
        stat = 'fraction'

    outfile.write(f'{stat}_{phenotype}_per_single_dosage\t'
                  '0.05_significance_CI\t'
                  '5e-8_significance_CI')

    if runtype == 'strs':
        outfile.write('\ttotal_subset_dosage_per_summed_gt\t'
                      f'{stat}_{phenotype}_per_paired_dosage\t'
                      '0.05_significance_CI\t'
                      '5e-8_significance_CI')
    outfile.write('\n')
    outfile.flush()

    start_time = time.time()
    for dosage_gts, unique_alleles, chrom, pos, locus_filtered, locus_details in genotype_iter:
        assert len(locus_details) == len(extra_detail_fields)

        covars[:, 0] = np.nan  # reuse the column that was the ids as the genotypes

        n_loci += 1
        allele_names = ','.join(list(unique_alleles.astype(str)))
        outfile.write(f"{chrom}\t{pos}\t{allele_names}\t")
        if locus_filtered:
            outfile.write(f'{locus_filtered}\t1\tnan\tnan\tnan\t')
            outfile.write('\t'.join(locus_details))
            if runtype == 'strs':
                outfile.write('\tnan' * 6 + '\n')
            else:
                outfile.write('\tnan' * 3 + '\n')
            outfile.flush()
            continue
        else:
            outfile.write('False\t')

        if runtype == 'strs':
            gts = np.sum([_len * np.sum(dosages, axis=1)
                          for _len, dosages in dosage_gts.items()], axis=0)
        else:
            gts = dosage_gts[:, 1] + 2 * dosage_gts[:, 2]
        std = np.std(gts)
        gts = (gts - np.mean(gts)) / std
        covars[:, 0] = gts

        if not binary or binary == 'linear':
            # do the regression
            model = OLS(
                outcome,
                covars,
                missing='drop',
            )
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            se = reg_result.bse[0]
            rsquared = reg_result.rsquared
            outfile.write(f"{pval:.2e}\t{coef/std}\t{se/std}\t{rsquared}\t")
        else:
            model = sm.GLM(outcome,
                           covars,
                           missing='drop',
                           family=sm.families.Binomial())
            reg_result = model.fit()
            pval = reg_result.pvalues[0]
            coef = reg_result.params[0]
            outfile.write(f'{pval:.2e}\t{coef/std}\tnan\tnan\t')

        outfile.write('\t'.join(locus_details) + '\t')

        if runtype == 'strs':
            single_dosages = {}

            paired_dosages = {}
            for len1 in unique_alleles:
                for len2 in unique_alleles:
                    if len1 > len2:
                        continue
                    if len1 != len2:
                        dosages = (
                            dosage_gts[len1][:, 0] * dosage_gts[len2][:, 1] +
                            dosage_gts[len1][:, 1] * dosage_gts[len2][:, 0])
                    else:
                        dosages = dosage_gts[len1][:, 0] * dosage_gts[len1][:, 1]
                    if np.sum(dosages) <= 0:
                        continue
                    summed_len = round(len1 + len2, 2)
                    if summed_len not in single_dosages:
                        single_dosages[summed_len] = dosages
                    else:
                        single_dosages[summed_len] += dosages
                    minlen = min(len1, len2)
                    maxlen = max(len1, len2)
                    paired_dosages[(minlen, maxlen)] = dosages
            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            paired_dosage_stat = {}
            paired_dosage_95_CI = {}
            paired_dosage_GWAS_CI = {}
            if not binary:
                for _len, dosages in single_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    single_dosage_stat[_len] = mean_stats.mean
                    single_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[_len] = mean_stats.tconfint_mean(
                        5e-8)
                for _len, dosages in paired_dosages.items():
                    if len(np.unique(ori_phenotypes[dosages != 0])) <= 1:
                        continue
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosages)
                    paired_dosage_stat[_len] = mean_stats.mean
                    paired_dosage_95_CI[_len] = mean_stats.tconfint_mean()
                    paired_dosage_GWAS_CI[_len] = mean_stats.tconfint_mean(
                        5e-8)
            else:
                for _len, dosages in single_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    single_dosage_stat[_len] = p
                    single_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
                for _len, dosages in paired_dosages.items():
                    if not np.any(dosages != 0):
                        continue
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 0.05)
                    paired_dosage_stat[_len] = p
                    paired_dosage_95_CI[_len] = (lower, upper)
                    _, lower_gwas, upper_gwas = weighted_binom_conf.weighted_binom_conf(
                        dosages, ori_phenotypes, 5e-8)
                    paired_dosage_GWAS_CI[_len] = (lower_gwas, upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) +
                '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(
                    {key: np.sum(arr)
                     for key, arr in single_dosages.items()}) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(paired_dosage_GWAS_CI) +
                '\n')
        else:
            single_dosage_stat = {}
            single_dosage_95_CI = {}
            single_dosage_GWAS_CI = {}
            if not binary:
                for alt_count in range(3):
                    mean_stats = statsmodels.stats.weightstats.DescrStatsW(
                        ori_phenotypes, weights=dosage_gts[:, alt_count])
                    single_dosage_stat[alt_count] = mean_stats.mean
                    single_dosage_95_CI[alt_count] = mean_stats.tconfint_mean()
                    single_dosage_GWAS_CI[
                        alt_count] = mean_stats.tconfint_mean(5e-8)
            else:
                for alt_count in range(3):
                    p, lower, upper = weighted_binom_conf.weighted_binom_conf(
                        dosage_gts[:, alt_count], ori_phenotypes, 0.05)
                    single_dosage_stat[alt_count] = p
                    single_dosage_95_CI[alt_count] = (lower, upper)
                    _, lower_gwas, upper_gwas = weighted_binom_conf.weighted_binom_conf(
                        dosage_gts[:, alt_count], ori_phenotypes, 5e-8)
                    single_dosage_GWAS_CI[alt_count] = (lower_gwas, upper_gwas)
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_stat) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_95_CI) + '\t')
            outfile.write(
                load_and_filter_genotypes.dict_str(single_dosage_GWAS_CI) +
                '\n')

        outfile.flush()

        duration = time.time() - start_time
        total_time += duration
        batch_time += duration
        if n_loci % batch_size == 0:
            print(
                f"time/locus (last {batch_size}): "
                f"{batch_time/batch_size}s\n"
                f"time/locus ({n_loci} total loci): {total_time/n_loci}s\n",
                flush=True)
            batch_time = 0
        start_time = time.time()
    if n_loci > 0:
        print(
            f"Done.\nTotal loci: {n_loci}\nTotal time: {total_time}s\ntime/locus: {total_time/n_loci}s\n",
            flush=True)
    else:
        print(f"No variants found in the region {region}\n", flush=True)
Code example #58
def qqline(ax, line, x=None, y=None, dist=None, fmt="r-", **lineoptions):
    """
    Plot a reference line for a qqplot.

    Parameters
    ----------
    ax : matplotlib axes instance
        The axes on which to plot the line
    line : str {"45","r","s","q"}
        Options for the reference line to which the data is compared:

        - "45" - 45-degree line
        - "s"  - standardized line, the expected order statistics are scaled by
                 the standard deviation of the given sample and have the mean
                 added to them
        - "r"  - A regression line is fit
        - "q"  - A line is fit through the quartiles.
        - None - By default no reference line is added to the plot.

    x : ndarray
        X data for plot. Not needed if line is "45".
    y : ndarray
        Y data for plot. Not needed if line is "45".
    dist : scipy.stats.distribution
        A scipy.stats distribution, needed if line is "q".
    fmt : str, optional
        Line format string passed to `plot`.
    **lineoptions
        Additional arguments to be passed to the `plot` command.

    Notes
    -----
    There is no return value. The line is plotted on the given `ax`.

    Examples
    --------
    Import the food expenditure dataset.  Plot household income on the x-axis
    and annual food expenditure on the y-axis.  Use qqline to add a regression
    line to the plot.

    >>> import statsmodels.api as sm
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from statsmodels.graphics.gofplots import qqline

    >>> foodexp = sm.datasets.engel.load()
    >>> x = foodexp.exog
    >>> y = foodexp.endog
    >>> ax = plt.subplot(111)
    >>> plt.scatter(x, y)
    >>> ax.set_xlabel(foodexp.exog_name[0])
    >>> ax.set_ylabel(foodexp.endog_name)
    >>> qqline(ax, "r", x, y)
    >>> plt.show()

    .. plot:: plots/graphics_gofplots_qqplot_qqline.py
    """
    lineoptions = lineoptions.copy()
    for ls in ("-", "--", "-.", ":"):
        if ls in fmt:
            lineoptions.setdefault("linestyle", ls)
            fmt = fmt.replace(ls, "")
            break
    for marker in (
            ".",
            ",",
            "o",
            "v",
            "^",
            "<",
            ">",
            "1",
            "2",
            "3",
            "4",
            "8",
            "s",
            "p",
            "P",
            "*",
            "h",
            "H",
            "+",
            "x",
            "X",
            "D",
            "d",
            "|",
            "_",
    ):
        if marker in fmt:
            lineoptions.setdefault("marker", marker)
            fmt = fmt.replace(marker, "")
            break
    if fmt:
        lineoptions.setdefault("color", fmt)

    if line == "45":
        end_pts = lzip(ax.get_xlim(), ax.get_ylim())
        end_pts[0] = min(end_pts[0])
        end_pts[1] = max(end_pts[1])
        ax.plot(end_pts, end_pts, **lineoptions)
        ax.set_xlim(end_pts)
        ax.set_ylim(end_pts)
        return  # does this have any side effects?
    if x is None or y is None:
        raise ValueError("If line is not 45, x and y cannot be None.")
    x = np.array(x)
    y = np.array(y)
    if line == "r":
        # could use ax.lines[0].get_xdata(), get_ydata(),
        # but don't know axes are "clean"
        y = OLS(y, add_constant(x)).fit().fittedvalues
        ax.plot(x, y, **lineoptions)
    elif line == "s":
        m, b = np.std(y), np.mean(y)
        ref_line = x * m + b
        ax.plot(x, ref_line, **lineoptions)
    elif line == "q":
        _check_for(dist, "ppf")
        q25 = stats.scoreatpercentile(y, 25)
        q75 = stats.scoreatpercentile(y, 75)
        theoretical_quartiles = dist.ppf([0.25, 0.75])
        m = (q75 - q25) / np.diff(theoretical_quartiles)
        b = q25 - m * theoretical_quartiles[0]
        ax.plot(x, m * x + b, **lineoptions)
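
The docstring example demonstrates line="r"; the "q" branch instead fits a line through the sample and theoretical quartiles, so `dist` must provide a ppf. A small sketch of that branch (qqline and its _check_for helper assumed in scope):

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
y = np.sort(rng.normal(loc=2.0, scale=3.0, size=200))
x = stats.norm.ppf((np.arange(200) + 0.5) / 200)  # theoretical quantiles

ax = plt.subplot(111)
ax.plot(x, y, 'o')
qqline(ax, "q", x=x, y=y, dist=stats.norm)
plt.show()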
Code example #59
    def start(self):
        self.print_arguments()

        print("### STEP 1 ###")
        print("Loading genotype data and dataset info.")
        geno_df = self.load_file(self.geno_path, header=0, index_col=0, nrows=self.nrows)

        dataset_mask = np.ones(geno_df.shape[1], dtype=bool)
        if self.std_path is not None:
            std_df = self.load_file(self.std_path, header=0, index_col=None)

            # Validate that the input data matches.
            self.validate_data(std_df=std_df,
                               geno_df=geno_df)

            # Filter on datasets.
            if self.datasets is not None:
                print("Filtering datasets.")
                dataset_mask = std_df["dataset"].isin(self.datasets).to_numpy()
                std_df = std_df.loc[dataset_mask, :]
                geno_df = geno_df.loc[:, dataset_mask]
        else:
            # Create sample-to-dataset file with all the samples having the
            # same dataset.
            std_df = pd.DataFrame({"sample": geno_df.columns, "dataset": "None"})

        print("Checking dataset sample sizes")
        dataset_sample_counts = list(zip(*np.unique(std_df.iloc[:, 1], return_counts=True)))
        dataset_sample_counts.sort(key=lambda x: -x[1])
        datasets = [x[0] for x in dataset_sample_counts]
        max_dataset_length = np.max([len(str(dataset[0])) for dataset in dataset_sample_counts])
        for dataset, sample_size in dataset_sample_counts:
            print("\t{:{}s}  {:,} samples".format(dataset, max_dataset_length, sample_size))
        if dataset_sample_counts[-1][1] <= 1:
            print("\t  One or more datasets have a smaller sample "
                  "size than recommended. Consider excluded these")
            exit()

        # Construct dataset df.
        dataset_df = self.construct_dataset_df(std_df=std_df)

        print("Calculating genotype call rate per dataset")
        geno_df, call_rate_df = self.calculate_call_rate(geno_df=geno_df,
                                                         std_df=std_df,
                                                         datasets=datasets)
        call_rate_n_skipped = (call_rate_df.min(axis=1) < self.call_rate).sum()
        if call_rate_n_skipped > 0:
            print("\t{:,} eQTLs have had dataset(s) filled with NaN "
                  "values due to call rate threshold ".format(call_rate_n_skipped))

        print("Calculating genotype stats for inclusing criteria")
        cr_keep_mask = ~(geno_df == self.genotype_na).all(axis=1).to_numpy(dtype=bool)
        geno_stats_df = pd.DataFrame(np.nan, index=geno_df.index, columns=["N", "NaN", "0", "1", "2", "min GS", "HW pval", "allele1", "allele2", "MA", "MAF"])
        geno_stats_df["N"] = 0
        geno_stats_df["NaN"] = geno_df.shape[1]
        geno_stats_df.loc[cr_keep_mask, :] = self.calculate_genotype_stats(df=geno_df.loc[cr_keep_mask, :])

        # Checking which eQTLs pass the requirements
        n_keep_mask = (geno_stats_df.loc[:, "N"] >= 6).to_numpy()
        hwpval_keep_mask = (geno_stats_df.loc[:, "HW pval"] >= self.hw_pval).to_numpy()
        maf_keep_mask = (geno_stats_df.loc[:, "MAF"] > self.maf).to_numpy()
        combined_keep_mask = n_keep_mask & hwpval_keep_mask & maf_keep_mask
        geno_n_skipped = np.size(combined_keep_mask) - np.sum(combined_keep_mask)
        if geno_n_skipped > 0:
            print("\t{:,} eQTL(s) failed the sample size threshold".format(np.size(n_keep_mask) - np.sum(n_keep_mask)))
            print("\t{:,} eQTL(s) failed the Hardy-Weinberg p-value threshold".format(np.size(hwpval_keep_mask) - np.sum(hwpval_keep_mask)))
            print("\t{:,} eQTL(s) failed the MAF threshold".format(np.size(maf_keep_mask) - np.sum(maf_keep_mask)))
            print("\t----------------------------------------")
            print("\t{:,} eQTL(s) are discarded in total".format(geno_n_skipped))

        # Add mask to genotype stats data frame.
        geno_stats_df["mask"] = 0
        geno_stats_df.loc[combined_keep_mask, "mask"] = 1

        self.save_file(df=call_rate_df, outpath=os.path.join(self.outdir, "call_rate.txt.gz"))
        self.save_file(df=geno_stats_df, outpath=os.path.join(self.outdir, "geno_stats.txt.gz"))

        del call_rate_df, geno_stats_df

        if geno_n_skipped == self.nrows:
            print("Error, no valid eQTLs.")
            exit()

        print("")

        ########################################################################

        print("### STEP 2 ###")
        print("Loading other data.")
        alle_df = self.load_file(self.alle_path, header=0, index_col=0, nrows=self.nrows)
        expr_df = self.load_file(self.expr_path, header=0, index_col=0, nrows=self.nrows)
        cova_df = self.load_file(self.cova_path, header=0, index_col=0)

        # Transpose if need be. We want samples always as columns.
        if cova_df.shape[0] == np.size(dataset_mask):
            print("\t  Transposing covariate matrix.")
            cova_df = cova_df.T

        # Filter the datasets.
        if dataset_mask is not None:
            expr_df = expr_df.loc[:, dataset_mask]
            cova_df = cova_df.loc[:, dataset_mask]

        # Select eQTL rows that meet requirements.
        geno_df = geno_df.loc[combined_keep_mask, :]
        alle_df = alle_df.loc[combined_keep_mask, :]
        expr_df = expr_df.loc[combined_keep_mask, :]

        print("\tValidating input.")
        self.validate_data(std_df=std_df,
                           geno_df=geno_df,
                           alle_df=alle_df,
                           expr_df=expr_df,
                           cova_df=cova_df)
        print("", flush=True)
        del std_df

        ########################################################################

        print("### STEP 3 ###")
        print("Pre-processing data.")
        # Add the allele assessed column.
        alle_df["AlleleAssessed"] = alle_df["Alleles"].str.split("/", n=None, expand=True)[1]
        alle_df.drop(["AltAllele"], axis=1, inplace=True)
        alle_df.reset_index(drop=True, inplace=True)

        # Convert to numpy for speed.
        geno_m = geno_df.to_numpy(np.float64)
        expr_m = expr_df.to_numpy(np.float64)
        dataset_m = dataset_df.to_numpy(np.uint8)
        cova_m = cova_df.to_numpy(np.float64)

        # Replace missing values with nan
        geno_m[geno_m == self.genotype_na] = np.nan
        cova_m[cova_m == self.covariate_na] = np.nan

        # Save properties.
        snps = list(geno_df.index)
        genes = list(expr_df.index)
        covariates = list(cova_df.index)
        datasets = list(dataset_df.columns)
        del geno_df, expr_df, dataset_df, cova_df

        # Print info.
        n_eqtls = geno_m.shape[0]
        n_samples = geno_m.shape[1]
        n_covariates = cova_m.shape[0]
        print("Summary stats:")
        print("\tN-eQTLs: {:,}".format(n_eqtls))
        print("\tN-samples: {:,}".format(n_samples))
        print("\tN-covariates: {:,}".format(n_covariates))
        print("\tN-datasets: {:,}".format(len(datasets)))
        print("", flush=True)

        ########################################################################

        print("### STEP 4 ###")
        print("Analyzing eQTLs.")

        # Initializing output matrices / arrays.
        ieqtl_results = {cov: np.empty((n_eqtls, 14), dtype=np.float64) for cov
                         in covariates}

        # Start loop.
        start_time = int(time.time())
        last_print_time = None
        for eqtl_index in range(n_eqtls):
            # Print update for user.
            now_time = int(time.time())
            if n_eqtls > 1 and (last_print_time is None or (now_time - last_print_time) >= self.print_interval or eqtl_index == (n_eqtls - 1)):
                print("\t[{}] {:,}/{:,} eQTLs analysed [{:.2f}%]".format(time.strftime('%H:%M:%S', time.gmtime(now_time - start_time)),
                                                                         eqtl_index,
                                                                         (n_eqtls - 1),
                                                                         (100 / (n_eqtls - 1)) * eqtl_index),
                      flush=True)
                last_print_time = now_time

            # Get the genotype.
            genotype = geno_m[eqtl_index, :]

            for cov_index, cov in enumerate(covariates):
                # Get the covariate.
                covariate = cova_m[cov_index, :]

                # Construct the mask to remove missing values.
                mask = np.logical_and(~np.isnan(genotype), ~np.isnan(covariate))
                n = np.sum(mask)

                # Create the matrix.
                X = np.empty((n, 4), np.float32)
                X[:, 0] = 1
                X[:, 1] = genotype[mask]
                X[:, 2] = cova_m[cov_index, mask]
                X[:, 3] = X[:, 1] * X[:, 2]

                # Get the expression.
                y = expr_m[eqtl_index, mask]

                # Check if there is variance on each column. Also check
                # if each column is unique.
                if (np.min(np.std(X[:, 1:], axis=0)) == 0) or (np.unique(X, axis=1).shape[1] != 4):
                    ieqtl_results[cov][eqtl_index, :] = np.array([n] + [np.nan] * 13)
                    continue

                if self.dataset_correct:
                    # Correct expression for dataset differences.
                    dataset_subset_m = dataset_m[mask, :].copy()
                    dataset_subset_m = dataset_subset_m[:, np.sum(dataset_subset_m, axis=0) > 0]
                    corr_m = np.hstack((X[:, [0]], dataset_subset_m[:, 1:], dataset_subset_m * genotype[mask][:, np.newaxis]))
                    y = OLS(y, corr_m).fit().resid

                # First calculate the rss for the matrix minus the interaction
                # term.
                rss_null = self.calc_rss(y=y,
                                         y_hat=self.fit_and_predict(X=X[:, :3],
                                                                    y=y))

                # Calculate the rss for the interaction model.
                inv_m = self.inverse(X)
                betas = self.fit(X=X,
                                 y=y,
                                 inv_m=inv_m)
                rss_alt = self.calc_rss(y=y,
                                        y_hat=self.predict(X=X,
                                                           betas=betas))
                std = self.calc_std(rss=rss_alt,
                                    n=n,
                                    df=4,
                                    inv_m=inv_m)

                # Calculate interaction p-value.
                p_value = self.calc_p_value(rss1=rss_null,
                                            rss2=rss_alt,
                                            df1=3,
                                            df2=4,
                                            n=n)

                # Calculate the t-values.
                t_values = betas / std

                # Save results.
                ieqtl_results[cov][eqtl_index, :] = np.hstack((np.array([n]),
                                                               betas,
                                                               std,
                                                               t_values,
                                                               np.array([p_value])))
        print("", flush=True)

        ########################################################################

        print("### STEP 5 ###")
        print("Saving results.")

        for covariate in covariates:
            print("  {}:".format(covariate))
            # Convert to pandas data frame.
            df = pd.DataFrame(ieqtl_results[covariate],
                              columns=["N",
                                       "beta-intercept",
                                       "beta-genotype",
                                       "beta-covariate",
                                       "beta-interaction",
                                       "std-intercept",
                                       "std-genotype",
                                       "std-covariate",
                                       "std-interaction",
                                       "tvalue-intercept",
                                       "tvalue-genotype",
                                       "tvalue-covariate",
                                       "tvalue-interaction",
                                       "p-value"]
                              )

            df = pd.concat([alle_df, df], axis=1)
            df.insert(0, "ProbeName", genes)
            df.insert(0, "SNPName", snps)
            df["FDR"] = np.nan
            df.loc[~df["p-value"].isnull(), "FDR"] = multitest.multipletests(df.loc[~df["p-value"].isnull(), "p-value"], method='fdr_bh')[1]
            print("\t{:,} ieQTLs (p-value <0.05)".format(df.loc[df["p-value"] < 0.05, :].shape[0]))
            print("\t{:,} ieQTLs (BH-FDR <0.05)".format(df.loc[df["FDR"] < 0.05, :].shape[0]))

            # Save.
            self.save_file(df=df,
                           outpath=os.path.join(self.outdir, "{}_InteractionResults.txt.gz".format(covariate.replace(" ", ""))),
                           index=False)

            # tmp.
            df["chr"] = [int(x.split(":")[0]) for x in df["SNPName"]]
            tested_counts = df["chr"].value_counts()
            signif_counts = df.loc[df["FDR"] < 0.05, "chr"].value_counts()

            print("")
            print("  Hits per chromosome:")
            for i in range(1, 23):
                n_tested = 0
                if i in tested_counts.index:
                    n_tested = tested_counts[i]

                n_signif = 0
                if i in signif_counts.index:
                    n_signif = signif_counts[i]

                perc = 0
                if n_tested > 0:
                    perc = (100 / n_tested) * n_signif

                print("\t{}: {:,} / {:,} [{:.2f}%]".format(i, n_signif, n_tested, perc))

            print("", flush=True)
Code example #60
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog, prepend=False)
    cls.res1 = GLS(data.endog, data.exog).fit()
    cls.res2 = OLS(data.endog, data.exog).fit()
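
This comparison rests on the fact that GLS with no sigma (an identity error covariance) reduces to OLS, so the two fits agree. A minimal check of that equivalence:

import numpy as np
from statsmodels.api import GLS, OLS, add_constant

rng = np.random.default_rng(0)
X = add_constant(rng.normal(size=(50, 2)))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(size=50)

print(np.allclose(GLS(y, X).fit().params, OLS(y, X).fit().params))  # True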