def weight_plot(model_results: RegressionResultsWrapper, score='t'):
    """Draw a weight plot for the coefficients of a fitted regression.

    The coefficient table is read from ``model_results.summary2().tables[1]``.
    Each coefficient is drawn on its own row as a point estimate (``Coef.``)
    with a line spanning the ``[0.025`` and ``0.975]`` columns, capped by short
    vertical ticks and annotated with the estimate rounded to two decimals.
    Rows are placed at y = 0, 1, 2, ... in ascending order of the absolute
    value of the ``score`` column.

    Parameters
    ----------
    model_results : RegressionResultsWrapper
        Fitted results object; only its ``summary2()`` output is used.
    score : str
        Name of the coefficient-table column whose absolute value determines
        the row order.

    Returns
    -------
    None
        The figure is created through ``plt.subplots`` and is not returned.
    """
    coef_table = model_results.summary2().tables[1]
    coef_table['abs.t'] = coef_table[score].abs()
    coef_table = coef_table.sort_values('abs.t', ascending=True)
    n_rows = len(coef_table)
    row_positions = list(range(n_rows))

    # Half a unit of figure height per coefficient row.
    fig, ax = plt.subplots(figsize=(12, 0.5 * n_rows))
    sns.despine(fig, left=True, bottom=True)

    cap_half_height = 0.15
    for y_pos, (_, entry) in enumerate(coef_table.iterrows()):
        cap_bottom = y_pos - cap_half_height
        cap_top = y_pos + cap_half_height
        estimate = entry['Coef.']

        # Interval line; the marker is drawn only on the middle point
        # (the estimate itself).
        ax.plot(entry[['[0.025', 'Coef.', '0.975]']], [y_pos] * 3,
                'ko-',
                ms=5.,
                lw=2.,
                markevery=[1])
        # End caps of the interval.
        ax.vlines(entry['[0.025'], cap_bottom, cap_top)
        ax.vlines(entry['0.975]'], cap_bottom, cap_top)
        ax.annotate("%.2f" % estimate, (estimate, y_pos),
                    xytext=(-6, 4),
                    textcoords='offset points')

    # Dashed guide lines from the left edge of the axes to each lower bound.
    left_edge = ax.get_xlim()[0]
    ax.hlines(row_positions, [left_edge] * n_rows,
              coef_table['[0.025'],
              colors='lightgray',
              linestyle='--')
    ax.xaxis.set_visible(False)

    # Label each row with the coefficient name.
    ax.set_yticks(row_positions)
    ax.set_yticklabels(coef_table.index)
    # Reference line at zero.
    ax.vlines([0], -1, n_rows, colors='k', linestyle='--')
    ax.set_title("Weight plot", loc='left', size=18, pad=-20)
Beispiel #2
0
    def _package_attrs(self, attrs):
        """Build a ``RegressionResultsWrapper`` carrying the given attributes.

        Every entry of ``attrs`` is set both on the inner results object and on
        the wrapper, because some consumers read from the wrapper (stargazer)
        and others from the wrapped result (statsmodels' ``summary_col``).

        Parameters
        ----------
        attrs : mapping
            Attribute name -> value; iterated by key and indexed by key.

        Returns
        -------
        RegressionResultsWrapper
            Wrapper around the populated results object.
        """
        inner = RRegressionResults()

        # The target variable is the first left-hand-side term of the
        # formula, as parsed by patsy.
        formula_desc = ModelDesc.from_formula(self.formula)
        inner.target = formula_desc.lhs_termlist[0].name()
        inner.model = self

        # The wrapper is instantiated directly instead of subclassed because
        # stargazer checks "type()" rather than "isinstance()".
        wrapper = RegressionResultsWrapper(inner)

        for name in attrs:
            value = attrs[name]
            # Everything except "params" is a @cache_readonly attribute and
            # has to be deleted before it can be assigned again.
            if name != 'params' and hasattr(inner, name):
                delattr(inner, name)
            setattr(inner, name, value)
            setattr(wrapper, name, value)
            self._debug("Set {} to {}".format(name, value))

        # Only after all attributes are in place, present the inner object as
        # a plain RegressionResults.
        inner.__class__ = RegressionResults
        return wrapper
def show_result(poly_data: pandas.DataFrame,
                regression_model: RegressionResultsWrapper,
                predict_model: pandas.DataFrame, y_param: str, degree: int):
    """Plot the polynomial fit and print its coefficients.

    Parameters
    ----------
    poly_data : pandas.DataFrame
        Frame whose ``power_*`` columns name the polynomial features; it is
        also passed on to ``plot_polynomial``.
    regression_model : RegressionResultsWrapper
        Fitted model used for prediction and whose ``params`` are printed.
    predict_model : pandas.DataFrame
        Frame the predictions are computed from (its ``power_*`` columns plus
        an added constant).
    y_param : str
        Forwarded to ``plot_polynomial``.
    degree : int
        Polynomial degree, used only in the printed header.
    """
    power_columns = [
        column for column in poly_data.columns.values
        if column.startswith('power_')
    ]

    # (#7) predict on the polynomial features with a constant column added
    design = sm.add_constant(predict_model[power_columns])
    predictions = regression_model.predict(design)

    plot_polynomial(poly_data, predictions, y_param)

    # (#9) report the fitted coefficients
    header = '-----coefficient of degree {deg}------'.format(deg=degree)
    print(header)

    print_coefficient(regression_model.params)
Beispiel #4
0
def plot_confidence_intervals(res: RegressionResultsWrapper) -> alt.Chart:
    """Return an Altair box-plot chart of the coefficients' confidence bounds.

    The lower and upper bounds from ``res.conf_int()`` are stacked into a
    long table with columns ``regressor``, ``interval`` and ``estimate``;
    the chart plots ``estimate`` against ``regressor``.

    As a side effect, the ``"streamlit"`` Altair theme is registered and
    enabled on every call.
    """
    # Register and enable the custom theme.
    alt.themes.register("streamlit", streamlit_theme)
    alt.themes.enable("streamlit")

    # Default conf_int() call, i.e. the 95% interval; stack the lower and
    # upper columns into a single series.
    bounds = res.conf_int().stack()
    bounds.name = "estimate"

    column_names = {'level_0': 'regressor', 'level_1': 'interval'}
    long_table = pd.DataFrame(bounds).reset_index()
    long_table = long_table.rename(columns=column_names)

    boxplot = alt.Chart(long_table).mark_boxplot()
    encoded = boxplot.encode(x='regressor:O', y='estimate:Q')
    return encoded.properties(width=200, height=500)
Beispiel #5
0
def expression_fields(
    xs: np.ndarray,
    ys: np.ndarray,
    results: regres,
    n_ticks: int = 400,
) -> Tuple[np.ndarray, np.ndarray, Tuple[int, int]]:
    """Predict a fitted model on a regular grid over columns 1 and 2 of ``xs``.

    A ``n_ticks`` x ``n_ticks`` mesh is laid over the min/max range of
    ``xs[:, 1]`` and ``xs[:, 2]``. Every mesh point is turned into a row
    ``[1, a, b]`` and passed to ``results.predict``.

    Parameters
    ----------
    xs : np.ndarray
        2-D array; only columns 1 and 2 are read. Presumably column 0 is the
        intercept column, since a column of ones is prepended to the grid.
    ys : np.ndarray
        Not used by this function.
    results : regres
        Object exposing ``predict(array)``.
    n_ticks : int
        Number of grid points along each axis.

    Returns
    -------
    tuple
        ``(grid_points, predictions, mesh_shape)`` where ``grid_points`` holds
        the two coordinate columns of the flattened mesh, ``predictions`` is
        whatever ``results.predict`` returned, and ``mesh_shape`` is the shape
        of the mesh before flattening.
    """
    first_axis = np.linspace(np.min(xs[:, 1]), np.max(xs[:, 1]), n_ticks)
    second_axis = np.linspace(np.min(xs[:, 2]), np.max(xs[:, 2]), n_ticks)

    mesh_first, mesh_second = np.meshgrid(first_axis, second_axis)
    mesh_shape = mesh_first.shape

    flat_first = mesh_first.flatten()
    flat_second = mesh_second.flatten()
    # Design matrix: constant column followed by the two grid coordinates.
    design = np.column_stack(
        (np.ones(flat_first.shape[0]), flat_first, flat_second))
    predictions = results.predict(design)

    return (design[:, 1:], predictions, mesh_shape)
    def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
            max_iter=1000, p_tol=1e-6, **kwargs):
        """
        Solve by Iterative Weighted Least Squares

        Parameters
        ----------
        q : float
            Quantile, must be strictly between 0 and 1
        vcov : str, method used to calculate the variance-covariance matrix
            of the parameters. Default is ``robust``:

            - robust : heteroskedasticity robust standard errors (as suggested
              in Greene 6th edition)
            - iid : iid errors (as in Stata 12)

        kernel : str, kernel to use in the kernel density estimation for the
            asymptotic covariance matrix:

            - biw: Biweight
            - cos: Cosine
            - epa: Epanechnikov
            - gau: Gaussian
            - par: Parzen

        bandwidth : str, Bandwidth selection method in kernel density
            estimation for asymptotic covariance estimate (full
            references in QuantReg docstring):

            - hsheather: Hall-Sheather (1988)
            - bofinger: Bofinger (1975)
            - chamberlain: Chamberlain (1994)

        max_iter : int
            Maximum number of reweighting iterations. An
            ``IterationLimitWarning`` is issued when the loop ends after
            exactly this many iterations.
        p_tol : float
            Convergence tolerance. Iteration stops once the largest absolute
            change of any parameter between two iterations is no longer
            greater than ``p_tol``.
        **kwargs
            Accepted for signature compatibility; not used by this method.

        Returns
        -------
        RegressionResultsWrapper
            Wrapper around a ``QuantRegResults`` instance that additionally
            carries ``q``, ``iterations``, ``sparsity``, ``bandwidth`` and
            ``history`` (per-iteration ``params`` and ``mse``).

        Raises
        ------
        ValueError
            If ``q`` is not strictly inside (0, 1), or if ``kernel``,
            ``bandwidth`` or ``vcov`` is not one of the supported names.

        Notes
        -----
        Sets ``self.rank``, ``self.df_model`` and ``self.df_resid`` as a side
        effect. A ``ConvergenceWarning`` is issued if the parameter history
        shows an exact repeat (checked every 100 iterations from iteration
        300 on).
        """

        # q = 0 and q = 1 are rejected as well: the sparsity estimate below
        # evaluates norm.ppf(q - h) and norm.ppf(q + h), which leave (0, 1)
        # at the boundaries.
        if q <= 0 or q >= 1:
            raise ValueError('q must be strictly between 0 and 1')

        kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
        if kernel not in kern_names:
            raise ValueError("kernel must be one of " + ', '.join(kern_names))
        else:
            kernel = kernels[kernel]

        if bandwidth == 'hsheather':
            bandwidth = hall_sheather
        elif bandwidth == 'bofinger':
            bandwidth = bofinger
        elif bandwidth == 'chamberlain':
            bandwidth = chamberlain
        else:
            raise ValueError("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")

        endog = self.endog
        exog = self.exog
        nobs = self.nobs
        exog_rank = np.linalg.matrix_rank(self.exog)
        self.rank = exog_rank
        self.df_model = float(self.rank - self.k_constant)
        self.df_resid = self.nobs - self.rank
        n_iter = 0
        # First pass uses unweighted regressors, i.e. plain least squares.
        xstar = exog

        beta = np.ones(exog_rank)
        # TODO: better start, initial beta is used only for convergence check

        # Note the following does not work yet,
        # the iteration loop always starts with OLS as initial beta
        # if start_params is not None:
        #    if len(start_params) != rank:
        #       raise ValueError('start_params has wrong length')
        #       beta = start_params
        #    else:
        #       # start with OLS
        #       beta = np.dot(np.linalg.pinv(exog), endog)

        diff = 10
        cycle = False

        history = dict(params = [], mse=[])
        while n_iter < max_iter and diff > p_tol and not cycle:
            n_iter += 1
            beta0 = beta
            # Weighted least-squares step with the current row weights.
            xtx = np.dot(xstar.T, exog)
            xty = np.dot(xstar.T, endog)
            beta = np.dot(pinv(xtx), xty)
            resid = endog - np.dot(exog, beta)

            # Push near-zero residuals away from zero (keeping their sign)
            # so the division below stays finite.
            mask = np.abs(resid) < .000001
            resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
            # Asymmetric scaling of negative / non-negative residuals by q.
            resid = np.where(resid < 0, q * resid, (1-q) * resid)
            resid = np.abs(resid)
            # Reweight each observation by the inverse scaled residual.
            xstar = exog / resid[:, np.newaxis]
            diff = np.max(np.abs(beta - beta0))
            history['params'].append(beta)
            history['mse'].append(np.mean(resid*resid))

            if (n_iter >= 300) and (n_iter % 100 == 0):
                # check for convergence circle, should not happen
                for ii in range(2, 10):
                    if np.all(beta == history['params'][-ii]):
                        cycle = True
                        warnings.warn("Convergence cycle detected", ConvergenceWarning)
                        break

        if n_iter == max_iter:
            warnings.warn("Maximum number of iterations (" + str(max_iter) +
                          ") reached.", IterationLimitWarning)

        e = endog - np.dot(exog, beta)
        # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
        # h = 0.9 * np.std(e) / (nobs**0.2)
        # Instead, we calculate bandwidth as in Stata 12
        iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
        h = bandwidth(nobs, q)
        h = min(np.std(endog),
                iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))

        # Kernel density estimate of the residuals at zero.
        fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))

        if vcov == 'robust':
            d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
            xtxi = pinv(np.dot(exog.T, exog))
            xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
            vcov = chain_dot(xtxi, xtdx, xtxi)
        elif vcov == 'iid':
            vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
        else:
            raise ValueError("vcov must be 'robust' or 'iid'")

        lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)

        lfit.q = q
        lfit.iterations = n_iter
        lfit.sparsity = 1. / fhat0
        lfit.bandwidth = h
        lfit.history = history

        return RegressionResultsWrapper(lfit)
def get_rss(model: RegressionResultsWrapper, data: list, input_model: list,
            param_name: str) -> float:
    """Return the residual sum of squares of ``model`` on ``data``.

    Predictions are made from ``data[input_model]`` with a constant column
    added, subtracted from ``data[param_name]``, squared and summed.

    NOTE(review): ``data`` is annotated as ``list`` but is indexed with a
    list of labels and with a string label, so a pandas DataFrame is
    presumably what callers pass. Confirm before tightening the annotation.
    """
    design = sm.add_constant(data[input_model])
    errors = data[param_name] - model.predict(design)
    return (errors**2).sum()