Code example #1
    def test_fixed_scale(self):
        cov_type = 'fixed_scale'
        kwds = {}
        res1 = self.res_ols.get_robustcov_results(cov_type, **kwds)
        res2 = self.res_wls.get_robustcov_results(cov_type, **kwds)
        assert_allclose(res1.params, res2.params, rtol=1e-13)
        assert_allclose(res1.cov_params(), res2.cov_params(), rtol=1e-13)
        assert_allclose(res1.bse, res2.bse, rtol=1e-13)
        assert_allclose(res1.pvalues, res2.pvalues, rtol=1e-12)

        tt = res2.t_test(np.eye(len(res2.params)),
                         cov_p=res2.normalized_cov_params)
        assert_allclose(res2.cov_params(), res2.normalized_cov_params,
                        rtol=1e-13)
        assert_allclose(res2.bse, tt.sd, rtol=1e-13)
        assert_allclose(res2.pvalues, tt.pvalue, rtol=1e-13)
        assert_allclose(res2.tvalues, tt.tvalue, rtol=1e-13)

        # using cov_type in fit
        mod = self.res_wls.model
        mod3 = WLS(mod.endog, mod.exog, weights=mod.weights)
        res3 = mod3.fit(cov_type=cov_type, cov_kwds=kwds)
        tt = res3.t_test(np.eye(len(res3.params)),
                         cov_p=res3.normalized_cov_params)
        assert_allclose(res3.cov_params(), res3.normalized_cov_params,
                        rtol=1e-13)
        assert_allclose(res3.bse, tt.sd, rtol=1e-13)
        assert_allclose(res3.pvalues, tt.pvalue, rtol=1e-13)
        assert_allclose(res3.tvalues, tt.tvalue, rtol=1e-13)
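For context: 'fixed_scale' keeps the classical covariance formula but pins the scale at a supplied constant instead of the estimated residual variance. A minimal standalone sketch; the 'scale' entry in cov_kwds is an assumption here, only the cov_type string comes from the test above:

import numpy as np
from statsmodels.regression.linear_model import OLS

rng = np.random.RandomState(3)
x = np.column_stack((np.ones(40), rng.normal(size=40)))
y = x.dot([1.0, 0.5]) + rng.normal(size=40)

# with the scale fixed at 1.0 the covariance reduces to (X'X)^-1
res = OLS(y, x).fit(cov_type='fixed_scale', cov_kwds={'scale': 1.0})
print(res.bse)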
Code example #2
    def test_regularized_weights(self):

        np.random.seed(1432)
        exog1 = np.random.normal(size=(100, 3))
        endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
        exog2 = np.random.normal(size=(100, 3))
        endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

        exog_a = np.vstack((exog1, exog1, exog2))
        endog_a = np.concatenate((endog1, endog1, endog2))

        # Should be equivalent to exog_a, endog_a.
        exog_b = np.vstack((exog1, exog2))
        endog_b = np.concatenate((endog1, endog2))
        wgts = np.ones(200)
        wgts[0:100] = 2
        sigma = np.diag(1/wgts)

        for L1_wt in 0, 0.5, 1:
            for alpha in 0, 1:
                mod1 = OLS(endog_a, exog_a)
                rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod2 = WLS(endog_b, exog_b, weights=wgts)
                rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod3 = GLS(endog_b, exog_b, sigma=sigma)
                rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
                assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
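The test above leans on two identities: duplicating a block of rows is equivalent to giving those rows weight 2, and WLS with weights w is GLS with sigma = diag(1/w). A minimal sketch of the weights/sigma correspondence for an unpenalized fit:

import numpy as np
from statsmodels.regression.linear_model import GLS, WLS

rng = np.random.RandomState(0)
X = np.column_stack((np.ones(50), rng.normal(size=(50, 2))))
y = X.dot([1.0, 0.5, -0.5]) + rng.normal(size=50)
w = np.repeat([2.0, 1.0], 25)

# GLS with sigma = diag(1/w) whitens by sqrt(w), exactly like WLS
res_wls = WLS(y, X, weights=w).fit()
res_gls = GLS(y, X, sigma=np.diag(1.0 / w)).fit()
np.testing.assert_allclose(res_wls.params, res_gls.params, rtol=1e-12)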
Code example #3
    def setup_class(cls):
        nobs, k_exog = 100, 5
        np.random.seed(987125)
        x = np.random.randn(nobs, k_exog - 1)
        x = add_constant(x)
        cls.aweights = np.random.randint(1, 10, nobs)

        y_true = x.sum(1) / 2
        y = y_true + 2 * np.random.randn(nobs)
        cls.endog = y
        cls.exog = x
        cls.idx_uc = [0, 2, 3, 4]
        cls.idx_p_uc = np.array(cls.idx_uc)
        cls.idx_c = [1]
        cls.exogc = xc = x[:, cls.idx_uc]
        mod_ols_c = WLS(y - 0.5 * x[:, 1], xc, weights=cls.aweights)
        mod_ols_c.exog_names[:] = ['const', 'x2', 'x3', 'x4']
        cls.mod2 = mod_ols_c
        cls.init()
Code example #4
    def setup_class(cls):

        # from example wls.py

        nsample = 50
        x = np.linspace(0, 20, nsample)
        X = np.column_stack((x, (x - 5)**2))
        from statsmodels.tools.tools import add_constant
        X = add_constant(X)
        beta = [5., 0.5, -0.01]
        sig = 0.5
        w = np.ones(nsample)
        w[int(nsample * 6. / 10):] = 3
        y_true = np.dot(X, beta)
        e = np.random.normal(size=nsample)
        y = y_true + sig * w * e
        X = X[:,[0,1]]


        # ### WLS knowing the true variance ratio of heteroscedasticity

        mod_wls = WLS(y, X, weights=1./w)
        cls.res_wls = mod_wls.fit()
Code example #5
# excerpt: x, y, y_true, y_pred, exog, nobs and the fitted smoother `pmod`
# come from earlier in the script
import numpy as np
from statsmodels.regression.linear_model import OLS, WLS
from statsmodels.sandbox.nonparametric import smoothers

error = y - y_pred
mse = (error * error).mean()
print(mse)
res_ols = OLS(y, exog[:, :3]).fit()
print(np.squeeze(pmod.coef) - res_ols.params)

weights = np.ones(nobs)
weights[:nobs // 3] = 0.1
weights[-nobs // 5:] = 2

pmodw = smoothers.PolySmoother(2, x)
pmodw.fit(y, weights=weights)  # fits in place; no return value
y_predw = pmodw.predict(x)
error = y - y_predw
mse = (error * error).mean()
print(mse)
res_wls = WLS(y, exog[:, :3], weights=weights).fit()
print(np.squeeze(pmodw.coef) - res_wls.params)

doplot = 1
if doplot:
    import matplotlib.pyplot as plt
    plt.plot(y, '.')
    plt.plot(y_true, 'b-', label='true')
    plt.plot(y_pred, '-', label='poly')
    plt.plot(y_predw, '-', label='poly -w')
    plt.legend(loc='upper left')

    plt.close()
    #plt.show()
Code example #6
def test_predict_se():
    # this test doesn't use reference values;
    # it checks consistency across options and compares to a direct calculation

    # generate dataset
    nsample = 50
    x1 = np.linspace(0, 20, nsample)
    x = np.c_[x1, (x1 - 5)**2, np.ones(nsample)]
    np.random.seed(0)  # other seeds tried: 9876789, 9876543
    beta = [0.5, -0.01, 5.]
    y_true2 = np.dot(x, beta)
    w = np.ones(nsample)
    w[int(nsample * 6. / 10):] = 3
    sig = 0.5
    y2 = y_true2 + sig * w * np.random.normal(size=nsample)
    x2 = x[:,[0,2]]

    # estimate OLS
    res2 = OLS(y2, x2).fit()

    #direct calculation
    covb = res2.cov_params()
    predvar = res2.mse_resid + (x2 * np.dot(covb, x2.T).T).sum(1)
    predstd = np.sqrt(predvar)

    prstd, iv_l, iv_u = wls_prediction_std(res2)
    np.testing.assert_almost_equal(prstd, predstd, 15)

    #stats.t.isf(0.05/2., 50 - 2)
    q = 2.0106347546964458
    ci_half = q * predstd
    np.testing.assert_allclose(iv_u, res2.fittedvalues + ci_half, rtol=1e-12)
    np.testing.assert_allclose(iv_l, res2.fittedvalues - ci_half, rtol=1e-12)

    prstd, iv_l, iv_u = wls_prediction_std(res2, x2[:3,:])
    # first three rows of the direct calculation (default weights are 1)
    np.testing.assert_allclose(prstd, predstd[:3], rtol=1e-12)
    np.testing.assert_allclose(iv_u, res2.fittedvalues[:3] + ci_half[:3],
                               rtol=1e-12)
    np.testing.assert_allclose(iv_l, res2.fittedvalues[:3] - ci_half[:3],
                               rtol=1e-12)


    # check WLS
    res3 = WLS(y2, x2, 1. / w).fit()

    #direct calculation
    covb = res3.cov_params()
    predvar = res3.mse_resid * w + (x2 * np.dot(covb, x2.T).T).sum(1)
    predstd = np.sqrt(predvar)

    prstd, iv_l, iv_u = wls_prediction_std(res3)
    np.testing.assert_almost_equal(prstd, predstd, 15)

    #stats.t.isf(0.05/2., 50 - 2)
    q = 2.0106347546964458
    ci_half = q * predstd
    np.testing.assert_allclose(iv_u, res3.fittedvalues + ci_half, rtol=1e-12)
    np.testing.assert_allclose(iv_l, res3.fittedvalues - ci_half, rtol=1e-12)

    # testing shapes of exog
    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1:, :], weights=3.)
    prstd2, _, _ = wls_prediction_std(res3, x2[-1, :], weights=3.)
    # 1d and 2d exog rows should give the same result
    np.testing.assert_equal(prstd, prstd2)

    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-2:, :], weights=3.)
    prstd2, _, _ = wls_prediction_std(res3, x2[-2:, :], weights=[3, 3])
    # scalar and array weights should be equivalent
    np.testing.assert_equal(prstd, prstd2)

    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[:3,:])
    # the first 30 observations have w == 1, so default weights match
    np.testing.assert_allclose(prstd, predstd[:3], rtol=1e-12)
    np.testing.assert_allclose(iv_u, res3.fittedvalues[:3] + ci_half[:3],
                               rtol=1e-12)
    np.testing.assert_allclose(iv_l, res3.fittedvalues[:3] - ci_half[:3],
                               rtol=1e-12)


    #use wrong size for exog
    #prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1,0], weights=3.)
    np.testing.assert_raises(ValueError, wls_prediction_std, res3, x2[-1,0],
                             weights=3.)

    # check some weight values
    sew1 = wls_prediction_std(res3, x2[-3:,:])[0]**2
    for wv in np.linspace(0.5, 3, 5):

        sew = wls_prediction_std(res3, x2[-3:,:], weights=1. / wv)[0]**2
        np.testing.assert_allclose(sew, sew1 + res3.scale * (wv - 1))
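The "direct calculation" in this test is the standard out-of-sample prediction variance, scale (divided by the observation weight) plus x' Cov(beta) x. The same quantity is exposed as se_obs on the results' get_prediction API; a small self-contained sketch:

import numpy as np
from statsmodels.regression.linear_model import OLS

rng = np.random.RandomState(1)
x = np.column_stack((np.ones(30), rng.normal(size=30)))
y = x.dot([1.0, 2.0]) + rng.normal(size=30)
res = OLS(y, x).fit()

# prediction variance per observation: mse_resid + diag(x Cov(beta) x')
covb = res.cov_params()
predvar = res.mse_resid + (x * x.dot(covb)).sum(1)
pred = res.get_prediction(x)
np.testing.assert_allclose(pred.se_obs, np.sqrt(predvar), rtol=1e-12)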
Code example #7
File: onewaygls.py (project: arnab0000/Internships)
    def fitjoint(self):
        '''Fit a joint fixed effects model to all observations.

        The regression results are attached as `lsjoint`.

        The contrasts for the overall and pairwise tests for equality of
        coefficients are attached as a dictionary `contrasts`. This also
        includes the contrasts for the test that the coefficients of a level
        are zero. ::

            >>> res.contrasts.keys()
            [(0, 1), 1, 'all', 3, (1, 2), 2, (1, 3), (2, 3), (0, 3), (0, 2)]

        The keys are based on the original names or labels of the groups.

        TODO: keys can be numpy scalars, in which case they cannot be sorted.
        '''
        if not hasattr(self, 'weights'):
            self.fitbygroups()
        groupdummy = (self.groupsint[:, None] == self.uniqueint).astype(int)
        #order of dummy variables by variable - not used
        #dummyexog = self.exog[:,:,None]*groupdummy[:,None,1:]
        #order of dummy variables by group - used
        dummyexog = self.exog[:, None, :] * groupdummy[:, 1:, None]
        exog = np.c_[self.exog,
                     dummyexog.reshape(self.exog.shape[0], -1)]  #self.nobs ??
        #Note: I changed this to drop the first group from the dummies;
        #instead I want one full set of dummies
        if self.het:
            weights = self.weights
            res = WLS(self.endog, exog, weights=weights).fit()
        else:
            res = OLS(self.endog, exog).fit()
        self.lsjoint = res
        contrasts = {}
        nvars = self.exog.shape[1]
        nparams = exog.shape[1]
        ndummies = nparams - nvars
        contrasts['all'] = np.c_[np.zeros((ndummies, nvars)), np.eye(ndummies)]
        for groupind, group in enumerate(
                self.unique[1:]):  #need enumerate if groups != groupsint
            groupind = groupind + 1
            contr = np.zeros((nvars, nparams))
            contr[:, nvars * groupind:nvars * (groupind + 1)] = np.eye(nvars)
            contrasts[group] = contr
            #save also for pairs, see next
            contrasts[(self.unique[0], group)] = contr

        #Note: I'm keeping some duplication for testing
        pairs = np.triu_indices(len(self.unique), 1)
        for ind1, ind2 in zip(
                *pairs):  #replace with group1, group2 in sorted(keys)
            if ind1 == 0:
                continue  # need comparison with benchmark/normalization group separate
            g1 = self.unique[ind1]
            g2 = self.unique[ind2]
            group = (g1, g2)
            contr = np.zeros((nvars, nparams))
            contr[:, nvars * ind1:nvars * (ind1 + 1)] = np.eye(nvars)
            contr[:, nvars * ind2:nvars * (ind2 + 1)] = -np.eye(nvars)
            contrasts[group] = contr

        self.contrasts = contrasts
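The contrast matrices built here plug directly into the joint results' hypothesis tests. A hypothetical sketch, where `res` stands for an instance of the surrounding class with data attached:

# joint F-test that all group-specific coefficient shifts are zero
res.fitjoint()
print(res.lsjoint.f_test(res.contrasts['all']))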
Code example #8
 def _engine_factory(self, fy, X, check_integrity=True):
     if self.use_weighted_fit:
         return WLS(fy, X, weights=self._get_weights())
     else:
         return OLS(fy, X)
Code example #9
def gates(y, d, prop, s_hat, q=10, print_table=True):
    """Calculate Group Average Treatment Effect

    Parameters
    ----------
    y : ndarray
        vector of outcomes
    d : ndarray
        treatment indicator
    prop : ndarray
        treatment propensity
    s_hat : ndarray
        estimated treatment effect
    q : int, optional
        number of groups, by default 10
    print_table : bool, optional
        toggle results table, by default True

    Returns
    -------
    dict
        results with baseline and treatment effect for each group
    """

    # Define groups
    bin_indices, bin_edges, bin_pct = quantile_grid(
        x=s_hat + 1e-16 * np.random.uniform(size=len(s_hat)), q=q  # Break ties
    )

    # Dummy coding
    s_onehot = np.zeros((len(s_hat), len(bin_edges)))
    s_onehot[np.arange(0, len(s_hat)), bin_indices] = 1

    # Calculate model matrix
    x_reg = np.column_stack(
        (s_onehot, s_onehot * np.reshape(d - prop, newshape=(-1, 1)))
    )
    w_reg = (prop * (1 - prop)) ** (-1)  # inverse-variance weights 1 / (p * (1 - p))
    y_reg = y

    # Run weighted least squares
    labels_baseline = [
        f"Baseline: p={p / 100:.2f} ({x:.2f})"
        for p, x in zip(bin_pct.tolist(), bin_edges.tolist())
    ]
    labels_treatment = [
        f"Treatment: p={p / 100:.2f} ({x:.2f})"
        for p, x in zip(bin_pct.tolist(), bin_edges.tolist())
    ]
    labels = labels_baseline + labels_treatment

    # statsmodels WLS takes the weights via the `weights` keyword
    # (`w` is not a WLS argument)
    wls = WLS(endog=y_reg, exog=x_reg, weights=w_reg)
    wls = wls.fit()

    if print_table:
        print(wls.summary(xname=labels))

    return {
        "coef_baseline": wls.params[: len(labels_baseline)],
        "coef_treatment": wls.params[len(labels_baseline) :],
        "bin_values": bin_edges,
        "bin_count": np.sum(s_onehot, axis=0),
    }
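A hypothetical invocation of gates on synthetic data; quantile_grid is an external helper (assumed to return bin_indices, bin_edges, bin_pct), so this only illustrates the expected call signature:

import numpy as np

rng = np.random.RandomState(42)
n = 500
prop = np.full(n, 0.5)                # known treatment propensity
d = rng.binomial(1, prop)             # treatment indicator
s_hat = rng.normal(loc=1.0, size=n)   # proxy treatment-effect estimates
y = d * s_hat + rng.normal(size=n)    # outcomes
res = gates(y, d, prop, s_hat, q=5, print_table=False)
print(res["coef_treatment"])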
Code example #10
 def setup_class(cls):
     from statsmodels.datasets.ccard import load
     data = load(as_pandas=False)
     cls.res1 = WLS(data.endog, data.exog,
                    weights=1 / data.exog[:, 2]).fit()
     cls.res2 = GLS(data.endog, data.exog, sigma=data.exog[:, 2]).fit()
Code example #11
 def setup_class(cls):
     cls.exog = np.ones((1, ))
     cls.endog = np.ones((1, ))
     weights = 1
     cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit()
Code example #12
    def mr_presso(self, n_sims=1000, significance_thresh=0.05):
        """
        Python reimplementation of MR-PRESSO.

        :param n_sims: number of permutation simulations.
        :param significance_thresh: significance threshold.
        :return: beta, se and p value of the estimate after the outlier SNPs were
            removed. If no estimate can be made, returns a tuple of three np.nan values.
        """
        def make_random_data():
            beta_ivw, _, _ = self.do_ivw_estimation()
            random_exposure = np.random.normal(
                [x[0] for x in self.exposure_tuples],
                [x[1] for x in self.exposure_tuples])
            random_outcome = np.random.normal(
                [beta_ivw * x[0] for x in self.exposure_tuples],
                [x[1] for x in self.outcome_tuples])

            mr_estimates = np.zeros((len(random_outcome), 3), dtype=float)
            for i in range(len(random_outcome)):
                mr_estimates[i, :] = self.do_single_term_mr_estimate(
                    (random_exposure[i], self.exposure_tuples[i][1]),
                    (random_outcome[i], self.outcome_tuples[i][1]))

            return random_exposure, random_outcome, mr_estimates

        def leave_one_out_residual_sum_of_squares(estimation_data,
                                                  weighted_outcome,
                                                  weighted_exposure):

            estimation_data = np.asarray(estimation_data)
            leave_one_out_ivw = np.zeros(shape=(len(estimation_data), 3),
                                         dtype=float)
            for i in range(len(estimation_data)):
                leave_one_out_ivw[i, :] = self.do_ivw_estimation_on_estimate_vector(
                    np.delete(estimation_data, i, 0))

            rss = (weighted_outcome -
                   leave_one_out_ivw[:, 0] * weighted_exposure)**2

            return rss, leave_one_out_ivw

        def make_random_data_and_return_rss(weights):
            exposure, outcome, mr_estimates = make_random_data()

            weighted_exposure = exposure * weights
            weighted_outcome = outcome * weights

            rss, _ = leave_one_out_residual_sum_of_squares(
                mr_estimates, weighted_outcome, weighted_exposure)

            return np.sum(rss), np.column_stack((exposure, outcome))

        def randomly_sample_distortion(outlier_indices):
            estimates = np.asarray(self.estimation_data)
            estimates_no_outliers = np.delete(estimates,
                                              outlier_indices,
                                              axis=0)
            estimates_only_outliers = estimates[outlier_indices, :][0]

            indices_sampled_from_no_outliers = np.random.choice(
                estimates_no_outliers.shape[0],
                size=estimates_no_outliers.shape[0],
                replace=True)
            return self.do_ivw_estimation_on_estimate_vector(
                np.concatenate((
                    estimates_no_outliers[indices_sampled_from_no_outliers, :],
                    estimates_only_outliers)))

        # runtime checks.
        num_estimates = len(self.estimation_data)

        if num_estimates < 3:
            raise ValueError(
                "Only {} estimates supplied, need at least three to find simulate_mr presso outliers"
                .format(num_estimates))

        if len(self.exposure_tuples) != num_estimates:
            raise ValueError(
                "No exposure sumstats present, cannot do mr_presso outlier.")

        if len(self.outcome_tuples) != num_estimates:
            raise ValueError(
                "No outcome sumstats present, cannot do mr_presso outlier.")

        # this is just following MR presso.
        outcome = np.asarray(self.outcome_tuples, dtype=float)
        exposure = np.asarray(self.exposure_tuples, dtype=float)
        weighted_outcome = np.asarray(
            [x[0] / np.sqrt(x[1]**2) for x in self.outcome_tuples],
            dtype=float)
        weighted_exposure = np.asarray(
            [self.exposure_tuples[i][0] / np.sqrt(self.outcome_tuples[i][1]**2)
             for i in range(len(self.exposure_tuples))],
            dtype=float)
        weights = np.asarray(
            [1 / np.sqrt(x[1]**2) for x in self.outcome_tuples], dtype=float)

        rss, list_of_assocs = leave_one_out_residual_sum_of_squares(
            self.estimation_data, weighted_outcome, weighted_exposure)

        expected_results = [
            make_random_data_and_return_rss(weights) for _ in range(n_sims)
        ]

        sim_rss = [x[0] for x in expected_results]

        # compare the observed RSS to its permutation distribution (cast to an
        # array; a bare list cannot be compared to a float)
        global_p_val = np.sum(np.asarray(sim_rss) > np.sum(rss)) / n_sims
        local_p_val = None
        if global_p_val < significance_thresh:
            expected_betas = np.zeros((num_estimates, n_sims, 2), dtype=float)
            for i in range(n_sims):
                expected_betas[:, i] = expected_results[i][1]

            difference = outcome[:, 0] - exposure[:, 0] * list_of_assocs[:, 0]
            expected_difference = (
                expected_betas[:, :, 1]
                - expected_betas[:, :, 0]
                * np.tile(list_of_assocs[:, 0], (n_sims, 1)).transpose())
            local_p_val = np.sum(
                expected_difference**2 > (difference**2).reshape(-1, 1),
                axis=1) / n_sims
            local_p_val = np.asarray([
                x * len(difference) if x * len(difference) < 1.0 else 1.0
                for x in local_p_val
            ])

        # distortion test.
        outlier_corrected_ivw_result = (np.nan, np.nan, np.nan)
        if local_p_val is not None and sum(local_p_val < significance_thresh):
            outliers = local_p_val < significance_thresh

            exposure_betas = [
                self.exposure_tuples[i][0]
                for i in range(len(self.estimation_data)) if not outliers[i]
            ]
            outcome_betas = [
                self.outcome_tuples[i][0]
                for i in range(len(self.estimation_data)) if not outliers[i]
            ]
            weights = [
                1 / self.outcome_tuples[i][1]**2
                for i in range(len(self.estimation_data)) if not outliers[i]
            ]

            outlier_corrected_ivw_result = WLS(exog=exposure_betas,
                                               endog=outcome_betas,
                                               weights=weights).fit()

            return (outlier_corrected_ivw_result.params[0],
                    outlier_corrected_ivw_result.bse[0],
                    outlier_corrected_ivw_result.pvalues[0])
        else:
            return outlier_corrected_ivw_result
Code example #13
class LinearRegression(object):
    '''Patsy wrapper for linear estimation and prediction.

    Uses statsmodels WLS to allow weights.
    If no weights are provided, results are equivalent to OLS.
    '''
    def __init__(self, formula=None, data=None, **kwargs):

        # convert all variables raised to a power to float64
        # this prevents mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a smaller bit integer/float)
        if isinstance(data, pd.DataFrame):
            power_vars = list(set(re.findall(r'(?<=power\().+?(?=,)',
                                             formula)))
            for var in power_vars:
                data[var] = data[var].astype('float64')

        if formula:
            y, X = patsy.dmatrices(formula, data, 1)

            self._y_design_info = y.design_info
            self._X_design_info = X.design_info

            self._model = WLS(y, X, **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._std = np.std(data[self._model.data.ynames].values -
                               self.predict(data))
            self._r2 = self._fit.rsquared
            self._r2_adj = self._fit.rsquared_adj
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._std = None
            self._r2 = None
            self._r2_adj = None

    def __repr__(self):
        return str(self._fit.summary())

    def predict(self, data):
        '''
		Returns fitted values for the data provided.
		'''

        if len(data) == 0:
            return []

        # identifies exponential variables from the design matrix (via the 'power' flag) and converts to float64
        # this prevents mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a smaller bit integer/float)
        power_vars = list({
            re.search(r'(?<=power\().+?(?=,)', column).group()
            for column in self._X_design_info.column_names
            if 'power' in column
        })
        for var in power_vars:
            data[var] = data[var].astype('float64')

        (X, ) = patsy.build_design_matrices([self._X_design_info], data)

        return linear_transform(np.asarray(X), self._betas)

    def residuals(self, data):
        '''
		Returns residuals from fitting the model to the data provided.
		'''

        if len(data) == 0:
            return []

        return data[self._model.data.ynames].values - self.predict(data)

    def draw(self, data, rand_engine):
        '''
		Returns fitted values for the data provided plus a random draw
		from a normal distribution with the regression standard error.
		'''

        return self.predict(data) + rand_engine.normal(0, self._std, len(data))

    def Rsquared(self, adjusted=True):
        '''
		Returns the model's adjusted R squared.
		To return unadjusted R squared, pass adjusted=False.
		'''

        if adjusted:
            return self._r2_adj
        else:
            return self._r2

    def to_pickle(self, filename):
        '''
		Writes basic model information to a pickle file.
		'''

        pickle.dump((self._y_design_info, self._X_design_info, self._betas,
                     self._std, self._r2, self._r2_adj), open(filename, "wb"))

    @staticmethod
    def read_pickle(filename):
        '''
		Reads basic model information from a pickle file.

		Returns a LinearRegression object that does not include the model 
		summary or fit object but can execute all class functions.
		'''

        y_design_info, X_design_info, betas, std, r2, r2_adj = pickle.load(
            open(filename, "rb"))

        linear_regression = LinearRegression()
        linear_regression._y_design_info = y_design_info
        linear_regression._X_design_info = X_design_info
        linear_regression._betas = betas
        linear_regression._std = std
        linear_regression._r2 = r2
        linear_regression._r2_adj = r2_adj

        return linear_regression

    def __add__(self, other):
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
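A hypothetical usage sketch for the wrapper above; it assumes pandas, patsy and the module's linear_transform helper are importable alongside the class:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({"x": rng.normal(size=200)})
df["y"] = 2.0 * df["x"] + rng.normal(size=200)

# unweighted fit; WLS weights can be passed through **kwargs as weights=...
model = LinearRegression(formula="y ~ x", data=df)
print(model.Rsquared(adjusted=False))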
Code example #14
File: flux_regressor.py (project: waffle-iron/pychron)
 def _engine_factory(self, fy, X):
     if self.use_weighted_fit:
         return WLS(fy, X, weights=self._get_weights())
     else:
         return OLS(fy, X)
Code example #15
File: shared.py (project: snayeri/arch)
def fit_pval_model(
    quantiles: pd.DataFrame,
    small_order: int = 3,
    use_log: bool = False,
    drop_insignif: bool = True,
) -> PvalueResult:
    if small_order not in (3, 4):
        raise ValueError("Small order must be 3 or 4")
    quantiles = quantiles.sort_index(ascending=False)
    percentiles = quantiles.index.to_numpy()
    lhs = stats.norm.ppf(percentiles)
    data = np.asarray(quantiles)
    avg_test_stats = data.mean(1)
    avg_test_std = data.std(1)
    avg_test_stats = avg_test_stats[:, None]

    rhs = avg_test_stats**np.arange(4)
    rhs_large = rhs
    rhs_log = np.log(np.abs(avg_test_stats))**np.arange(4)
    lhs_large = lhs
    res_large = WLS(lhs_large, rhs, weights=1.0 / avg_test_std).fit()
    temp = res_large.params.copy()
    if drop_insignif:
        temp[res_large.pvalues > 0.05] = 0.0
    large_p = temp

    # Compute tau_max, by finding the func maximum
    p = res_large.params
    poly_roots = np.roots(np.array([3, 2, 1.0]) * p[:0:-1])
    if np.isreal(poly_roots[0]):
        tau_max = float(np.squeeze(np.real(np.max(poly_roots))))
    else:
        tau_max = np.inf

    # Small p regression using only p<=15%
    cutoff = np.where(percentiles <= 0.150)[0]
    lhs_small = lhs[cutoff]
    if use_log:
        avg_test_stats = np.log(np.abs(avg_test_stats[cutoff]))
        avg_test_std = np.log(np.abs(data[cutoff])).std(1)
        assert np.all(np.isfinite(avg_test_std))
        rhs = avg_test_stats**np.arange(small_order)
    else:
        avg_test_stats = avg_test_stats[cutoff]
        avg_test_std = avg_test_std[cutoff]
        rhs = avg_test_stats**np.arange(small_order)

    res_small = WLS(lhs_small, rhs, weights=1.0 / avg_test_std).fit()
    temp = res_small.params
    if drop_insignif:
        temp[res_small.pvalues > 0.05] = 0.0
    small_p = temp

    # Compute tau star
    err_large = lhs_large - rhs_large.dot(large_p)
    params = small_p.copy()
    if small_order == 3:
        # Missing 1 parameter here, replace with 0
        params = np.append(params, 0.0)
    if use_log:
        pred_small = rhs_log.dot(params)
    else:
        pred_small = rhs_large.dot(params)
    err_small = lhs_large - pred_small
    # Find the location that minimizes the total absolute error
    m = lhs_large.shape[0]
    abs_err = np.zeros((m, 1))
    for j in range(m):
        abs_err[j] = np.abs(err_large[:j]).sum() + np.abs(err_small[j:]).sum()
    loc = np.argmin(abs_err)
    tau_star = rhs_large[loc, 1]
    if use_log:
        assert tau_star < 0
    # Compute tau min
    tau_min = -params[1] / (2 * params[2])
    if use_log:
        assert small_order == 4
        assert params[2] * params[3] < 0
        tau_min = -np.inf
    large_p = [round(val, 5) for val in large_p]
    small_p = [round(val, 5) for val in small_p]
    tau_max = round(tau_max, 5)
    tau_star = round(tau_star, 5)
    tau_min = round(tau_min, 5)
    return PvalueResult(large_p, small_p, tau_max, tau_star, tau_min)
Code example #16
class LinearRegression(object):
    """Patsy wrapper for linear estimation and prediction.
    """

    def __init__(self, formula=None, data=None, **kwargs):

        if formula:
            y, X = patsy.dmatrices(formula, data, 1)

            self._y_design_info = y.design_info
            self._X_design_info = X.design_info

            self._model = WLS(y, X, **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._std = numpy.std(data[self._model.data.ynames].values - self.predict(data))
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._std = None

    def __repr__(self):
        return str(self._fit.summary())

    def predict(self, data):

        if len(data) == 0:
            return []

        (X, ) = patsy.build_design_matrices([self._X_design_info], data)

        return linear_transform(numpy.asarray(X), self._betas)

    def draw(self, data, rand_engine):

        return self.predict(data) + rand_engine.normal(0, self._std, len(data))

    def to_pickle(self, filename):

        pickle.dump((self._y_design_info, self._X_design_info, self._betas, self._std),
                    open(filename, "wb"))

    @staticmethod
    def read_pickle(filename):
        y_design_info, X_design_info, betas, std = pickle.load(open(filename, "rb"))

        linear_regression = LinearRegression()
        linear_regression._y_design_info = y_design_info
        linear_regression._X_design_info = X_design_info
        linear_regression._betas = betas
        linear_regression._std = std

        return linear_regression

    def __add__(self, other):
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
Code example #17
# excerpt: `x` and `nsample` are defined earlier in the script
X = np.column_stack((x, (x - 5)**2))
from statsmodels.tools.tools import add_constant
X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[nsample * 6 // 10:] = 3  # integer division; a float slice index fails on Python 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:,[0,1]]


# ### WLS knowing the true variance ratio of heteroscedasticity

mod_wls = WLS(y, X, weights=1./w)
res_wls = mod_wls.fit()



prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)

from numpy.testing import assert_allclose
assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

print(pred_res.summary_frame().head())

pred_res2 = res_wls.get_prediction()
def test_predict_se():
    # this test doesn't use reference values;
    # it checks consistency across options and compares to a direct calculation

    # generate dataset
    nsample = 50
    x1 = np.linspace(0, 20, nsample)
    x = np.c_[x1, (x1 - 5)**2, np.ones(nsample)]
    np.random.seed(0)  # other seeds tried: 9876789, 9876543
    beta = [0.5, -0.01, 5.]
    y_true2 = np.dot(x, beta)
    w = np.ones(nsample)
    w[int(nsample * 6. / 10):] = 3
    sig = 0.5
    y2 = y_true2 + sig * w * np.random.normal(size=nsample)
    x2 = x[:, [0, 2]]

    # estimate OLS
    res2 = OLS(y2, x2).fit()

    #direct calculation
    covb = res2.cov_params()
    predvar = res2.mse_resid + (x2 * np.dot(covb, x2.T).T).sum(1)
    predstd = np.sqrt(predvar)

    prstd, iv_l, iv_u = wls_prediction_std(res2)
    np.testing.assert_almost_equal(prstd, predstd, 15)

    #stats.t.isf(0.05/2., 50 - 2)
    q = 2.0106347546964458
    ci_half = q * predstd
    np.testing.assert_allclose(iv_u, res2.fittedvalues + ci_half, rtol=1e-12)
    np.testing.assert_allclose(iv_l, res2.fittedvalues - ci_half, rtol=1e-12)

    prstd, iv_l, iv_u = wls_prediction_std(res2, x2[:3, :])
    # first three rows of the direct calculation (default weights are 1)
    np.testing.assert_allclose(prstd, predstd[:3], rtol=1e-12)
    np.testing.assert_allclose(iv_u,
                               res2.fittedvalues[:3] + ci_half[:3],
                               rtol=1e-12)
    np.testing.assert_allclose(iv_l,
                               res2.fittedvalues[:3] - ci_half[:3],
                               rtol=1e-12)

    # check WLS
    res3 = WLS(y2, x2, 1. / w).fit()

    #direct calculation
    covb = res3.cov_params()
    predvar = res3.mse_resid * w + (x2 * np.dot(covb, x2.T).T).sum(1)
    predstd = np.sqrt(predvar)

    prstd, iv_l, iv_u = wls_prediction_std(res3)
    np.testing.assert_almost_equal(prstd, predstd, 15)

    #stats.t.isf(0.05/2., 50 - 2)
    q = 2.0106347546964458
    ci_half = q * predstd
    np.testing.assert_allclose(iv_u, res3.fittedvalues + ci_half, rtol=1e-12)
    np.testing.assert_allclose(iv_l, res3.fittedvalues - ci_half, rtol=1e-12)

    # testing shapes of exog
    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1:, :], weights=3.)
    prstd2, _, _ = wls_prediction_std(res3, x2[-1, :], weights=3.)
    # 1d and 2d exog rows should give the same result
    np.testing.assert_equal(prstd, prstd2)

    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-2:, :], weights=3.)
    prstd2, _, _ = wls_prediction_std(res3, x2[-2:, :], weights=[3, 3])
    # scalar and array weights should be equivalent
    np.testing.assert_equal(prstd, prstd2)

    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[:3, :])
    # the first 30 observations have w == 1, so default weights match
    np.testing.assert_allclose(prstd, predstd[:3], rtol=1e-12)
    np.testing.assert_allclose(iv_u,
                               res3.fittedvalues[:3] + ci_half[:3],
                               rtol=1e-12)
    np.testing.assert_allclose(iv_l,
                               res3.fittedvalues[:3] - ci_half[:3],
                               rtol=1e-12)

    #use wrong size for exog
    #prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1,0], weights=3.)
    np.testing.assert_raises(ValueError,
                             wls_prediction_std,
                             res3,
                             x2[-1, 0],
                             weights=3.)

    # check some weight values
    sew1 = wls_prediction_std(res3, x2[-3:, :])[0]**2
    for wv in np.linspace(0.5, 3, 5):

        sew = wls_prediction_std(res3, x2[-3:, :], weights=1. / wv)[0]**2
        np.testing.assert_allclose(sew, sew1 + res3.scale * (wv - 1))
Code example #19
    def do_egger_regression_two_variance_term(self):
        """
        Does egger regression based on two variance term estimates.

        :return: list of length two each with a tuple of floats: beta, se, wald_p_val of the estimate for intercept
        and slope respectively.
        """

        num_estimates = len(self.estimation_data)

        # runtime checks.

        if num_estimates < 3:
            raise ValueError(
                "Only {} estimates supplied, need at least three to estimate egger"
                .format(num_estimates))

        if len(self.exposure_tuples) != num_estimates:
            raise ValueError(
                "No exposure data present, cannot do Egger regression.")

        if len(self.outcome_tuples) != num_estimates:
            raise ValueError(
                "No outcome data present, cannot do Egger regression.")
        """
        Now turn exposure into positive values.
        """

        outcome_tuples = copy.deepcopy(self.outcome_tuples)
        exposure_tuples = copy.deepcopy(self.exposure_tuples)

        for i in range(num_estimates):
            if exposure_tuples[i][0] < 0:
                # flip the sign of the effect size, keep the standard error
                exposure_tuples[i] = (-1 * exposure_tuples[i][0],
                                      exposure_tuples[i][1])
                outcome_tuples[i] = (-1 * outcome_tuples[i][0],
                                     outcome_tuples[i][1])

        x_dat = np.asarray([x[0] for x in exposure_tuples])
        x_dat = add_constant(x_dat)

        y_dat = np.asarray([x[0] for x in outcome_tuples])

        # If this value is zero, we add the smallest possible constant so it can
        # still be used as weights. Checked against the 2015 paper introducing
        # MR-Egger; it works as expected.

        w_dat = np.zeros(len(self.estimation_data))
        for i in range(len(self.estimation_data)):
            w_dat[i] = outcome_tuples[i][0]**-2 / (
                (outcome_tuples[i][0]**-2 * outcome_tuples[i][1]**2)
                + (exposure_tuples[i][0]**-2 * exposure_tuples[i][1]**2))

        wls_model = WLS(y_dat, x_dat, weights=w_dat)
        results = wls_model.fit()

        self.egger_intercept = (results.params[0], results.bse[0],
                                results.pvalues[0])
        self.egger_slope = (results.params[1], results.bse[1],
                            results.pvalues[1])

        self.egger_done = True

        return self.egger_intercept, self.egger_slope
Code example #20
# excerpt: `x` and `nsample` are defined earlier in the script
X = np.column_stack((x, (x - 5)**2))
from statsmodels.tools.tools import add_constant

X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[nsample * 6 // 10:] = 3  # integer division; a float slice index fails on Python 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# ### WLS knowing the true variance ratio of heteroscedasticity

mod_wls = WLS(y, X, weights=1. / w)
res_wls = mod_wls.fit()

prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)

from numpy.testing import assert_allclose

assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

print(pred_res.summary_frame().head())

pred_res2 = res_wls.get_prediction()
ci2 = pred_res2.conf_int(obs=True)
Code example #21
File: betareg.py (project: timgates42/statsmodels)
    def _start_params(self, niter=2, return_intermediate=False):
        """find starting values

        Parameters
        ----------
        niter : int
            Number of iterations of WLS approximation
        return_intermediate : bool
            If False (default), then only the preliminary parameter estimate
            will be returned.
            If True, then also the two results instances of the WLS estimate
            for mean parameters and for the precision parameters will be
            returned.

        Returns
        -------
        sp : ndarray
            start parameters for the optimization
        res_m2 : results instance (optional)
            Results instance for the WLS regression of the mean function.
        res_p2 : results instance (optional)
            Results instance for the WLS regression of the precision function.

        Notes
        -----
        This calculates a few iteration of weighted least squares. This is not
        a full scoring algorithm.
        """
        # WLS of the mean equation uses the implied weights (inverse variance),
        # WLS for the precision equations uses weights that only take
        # account of the link transformation of the precision endog.
        from statsmodels.regression.linear_model import OLS, WLS
        res_m = OLS(self.link(self.endog), self.exog).fit()
        fitted = self.link.inverse(res_m.fittedvalues)
        resid = self.endog - fitted

        prec_i = fitted * (1 - fitted) / np.maximum(np.abs(resid), 1e-2)**2 - 1
        res_p = OLS(self.link_precision(prec_i), self.exog_precision).fit()
        prec_fitted = self.link_precision.inverse(res_p.fittedvalues)
        # sp = np.concatenate((res_m.params, res_p.params))

        for _ in range(niter):
            y_var_inv = (1 + prec_fitted) / (fitted * (1 - fitted))
            # y_var = fitted * (1 - fitted) / (1 + prec_fitted)

            ylink_var_inv = y_var_inv / self.link.deriv(fitted)**2
            res_m2 = WLS(self.link(self.endog),
                         self.exog,
                         weights=ylink_var_inv).fit()
            fitted = self.link.inverse(res_m2.fittedvalues)
            resid2 = self.endog - fitted

            prec_i2 = (fitted *
                       (1 - fitted) / np.maximum(np.abs(resid2), 1e-2)**2 - 1)
            w_p = 1. / self.link_precision.deriv(prec_fitted)**2
            res_p2 = WLS(self.link_precision(prec_i2),
                         self.exog_precision,
                         weights=w_p).fit()
            prec_fitted = self.link_precision.inverse(res_p2.fittedvalues)
            sp2 = np.concatenate((res_m2.params, res_p2.params))

        if return_intermediate:
            return sp2, res_m2, res_p2

        return sp2
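These starting values feed the full maximum-likelihood fit. A small sketch, assuming this method belongs to statsmodels' BetaModel (statsmodels.othermod.betareg), as the file name suggests:

import numpy as np
from statsmodels.othermod.betareg import BetaModel

rng = np.random.RandomState(5)
n = 200
x = np.column_stack((np.ones(n), rng.normal(size=n)))
mu = 1.0 / (1.0 + np.exp(-(0.5 + 0.5 * x[:, 1])))
y = np.clip(rng.beta(5 * mu, 5 * (1 - mu)), 1e-4, 1 - 1e-4)

mod = BetaModel(y, x)
sp = mod._start_params(niter=2)  # the WLS-based starting values above
res = mod.fit(start_params=sp)
print(res.params)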
Code example #22
 def setup_class(cls):
     data = longley.load(as_pandas=False)
     data.exog = add_constant(data.exog, prepend=False)
     cls.res1 = OLS(data.endog, data.exog).fit()
     cls.res2 = WLS(data.endog, data.exog).fit()
Code example #23
 def setupClass(cls):
     data = longley.load()
     data.exog = add_constant(data.exog)
     cls.res1 = OLS(data.endog, data.exog).fit()
     cls.res2 = WLS(data.endog, data.exog).fit()
Code example #24
    # excerpt: `data`, `t`, and the dfgls_* result containers come from the
    # surrounding simulation script; `norm` is scipy.stats.norm
    percentiles = data['percentiles']
    results = data['results']  # Remove later
    # LHS is norm cdf inv of percentiles
    lhs = norm().ppf(percentiles / 100.0)
    lhs_large = lhs
    # RHS is made up of avg test stats for largest T, which is in pos 1
    avg_test_stats = results[:, 1, :].mean(axis=1)
    avg_test_std = results[:, 1, :].std(axis=1)
    avg_test_stats = avg_test_stats[:, None]
    m = lhs.shape[0]
    rhs = np.ones((m, 1))
    rhs = np.hstack((rhs, avg_test_stats))
    rhs = np.hstack((rhs, avg_test_stats**2.0))
    rhs = np.hstack((rhs, avg_test_stats**3.0))
    rhs_large = rhs
    res_large = WLS(lhs, rhs, weights=1.0 / avg_test_std).fit()
    dfgls_large_p[t] = res_large.params
    # Compute tau_max, by finding the func maximum
    p = res_large.params
    poly_roots = np.roots(np.array([3, 2, 1.0]) * p[:0:-1])
    dfgls_tau_max[t] = float(np.squeeze(np.real(np.max(poly_roots))))

    # Small p regression using only p<=15%
    cutoff = np.where(percentiles <= 15.0)[0]
    avg_test_stats = results[cutoff, 1, :].mean(axis=1)
    avg_test_std = results[cutoff, 1, :].std(axis=1)
    avg_test_stats = avg_test_stats[:, None]
    lhs = lhs[cutoff]
    m = lhs.shape[0]
    rhs = np.ones((m, 1))
    rhs = np.hstack((rhs, avg_test_stats))
Code example #25
    def fit(self, x, y1, y2, cens, w, verbose=False):
        """
        Fit a maximum-likelihood Tobit regression
        :param x: Pandas DataFrame (n_samples, n_features): Data
        :param y: Pandas Series (n_samples,): Target
        :param cens: Pandas Series (n_samples,): -1 indicates left-censored samples, 0 for uncensored, 1 for right-censored
        :param verbose: boolean, show info from minimization
        :return:
        """
        x_copy = x.copy()
        if self.fit_intercept:
            x_copy = np.insert(x_copy, 0, 1, axis=1)
        else:
            x_copy = skl.scale(x_copy,
                               with_mean=True,
                               with_std=False,
                               copy=False)

        # From Stata intreg:
        #   qui gen double `z' = cond(`y1'<.&`y2'<.,(`y1'+`y2')/2, /*
        #   */    cond(`y1'<.,`y1',`y2')) `moff' if `doit'
        y = []
        counts = cens.value_counts()
        for value in [-1, 0, 1]:
            if value in counts:
                if value == -1:
                    split = cens == value
                    y_l = np.squeeze(y2[split].values)
                    y.append(y_l)
                elif value == 1:
                    split = cens == value
                    y_r = np.squeeze(y1[split].values)
                    y.append(y_r)
                elif value == 0:
                    split = cens == value
                    y_int = np.squeeze(
                        (y1[split].values + y2[split].values) / 2)
                    y.append(y_int)

        y = np.concatenate(y, axis=0)
        init_reg = WLS(y, x_copy, weights=w).fit()
        b0 = init_reg.params
        print(b0)  # debug: initial WLS parameter estimates
        y_pred = init_reg.predict(x_copy)
        resid = y - y_pred
        resid_var = np.var(resid)
        s0 = np.sqrt(resid_var)
        params0 = np.append(b0, s0)
        xs, ys, ys1, ys2, ws = split_left_right_censored(
            x_copy, y1, y2, cens, w)
        result = minimize(
            lambda params: tobit_neg_log_likelihood(xs, ys, ys1, ys2, ws,
                                                    params),
            params0,
            jac=None,
            method='Powell',
            tol=0.000001,
            # note: 'fatol' is a Nelder-Mead option; Powell uses 'xtol'/'ftol',
            # so scipy warns about and ignores this entry
            options={
                'disp': verbose,
                'maxiter': 10000000,
                'fatol': 0.00000001
            })

        if verbose:
            print(result)
#        self.ols_coef_ = b0[1:]
#        self.ols_intercept = b0[0]
        if self.fit_intercept:
            self.intercept_ = result.x[0]
            self.coef_ = result.x[1:-1]
        else:
            self.coef_ = result.x[:-1]
            self.intercept_ = 0
        self.sigma_ = result.x[-1]
        return self