Ejemplo n.º 1
0
    def f_stat(self):
        """Return the F-statistic of the regression.

        Delegates to ``regressors_stats.f_stat``, handing it ``self.ols``,
        ``self.X`` and ``self.y`` in that order.

        Returns
        -------
        float
            F-statistic of the beta coefficients, as computed by
            ``regressors_stats.f_stat``.
        """
        compute_f_stat = regressors_stats.f_stat
        return compute_f_stat(self.ols, self.X, self.y)
Ejemplo n.º 2
0
    def __init__(self, X, y, saved_reg=None):
        """
        Fit (or reuse) a linear regression and cache its diagnostics.

        The constructor stores the model, its predictions and residuals,
        goodness-of-fit numbers (R^2, MSE), per-coefficient statistics from
        ``get_p_values``, the data for a normal probability plot of the
        residuals together with a straight trend line through it, and the
        overall F-statistic with its degrees of freedom and p-value.

        :param X: independent data; it is read through ``X.shape`` and must
            therefore be a 2-D array
        :type X: np.array
        :param y: dependent data
        :type y: list
        :param saved_reg: optional object exposing ``reg`` and ``ols``
            attributes; when given, those are reused instead of fitting a
            new ``LinearRegression`` on ``X`` and ``y``
        """

        if saved_reg is None:
            self.reg = linear_model.LinearRegression()
            self.ols = self.reg.fit(X, y)
        else:
            self.reg = saved_reg.reg
            self.ols = saved_reg.ols

        self.y_intercept = self.reg.intercept_
        self.slope = self.reg.coef_
        # Intercept first, then the slopes: the parameter vector handed to
        # get_p_values.
        params = np.append(self.y_intercept, self.slope)
        self.predictions = self.reg.predict(X)

        self.r_sq = r2_score(y, self.predictions)
        self.mse = mean_squared_error(y, self.predictions)

        self.p_values, self.sd_b, self.ts_b = get_p_values(
            X, y, self.predictions, params)

        self.residuals = np.subtract(y, self.predictions)

        # fit=False: only the plot coordinates are returned, no fitted line.
        self.norm_prob_plot = scipy_stats.probplot(self.residuals,
                                                   dist='norm',
                                                   fit=False,
                                                   plot=None,
                                                   rvalue=False)

        # Fit a separate one-feature regression through the probability-plot
        # points to obtain a trend line for that plot.
        reg_prob = linear_model.LinearRegression()
        reg_prob.fit([[val] for val in self.norm_prob_plot[0]],
                     self.norm_prob_plot[1])

        self.y_intercept_prob = reg_prob.intercept_
        self.slope_prob = reg_prob.coef_
        # The trend line is drawn between the two extreme x coordinates.
        self.x_trend_prob = [
            min(self.norm_prob_plot[0]),
            max(self.norm_prob_plot[0])
        ]
        self.y_trend_prob = np.add(
            np.multiply(self.x_trend_prob, self.slope_prob),
            self.y_intercept_prob)

        self.f_stat = regressors_stats.f_stat(self.ols, X, y)
        n_samples, n_features = X.shape
        self.df_model = n_features
        # One extra degree of freedom is consumed by the intercept.
        self.df_error = n_samples - n_features - 1

        # The p-value of the overall F-test is the upper-tail probability
        # P(F >= f_stat), i.e. the survival function. The CDF is the
        # lower tail and would report values near 1 for significant fits.
        self.f_p_value = scipy_stats.f.sf(self.f_stat, self.df_model,
                                          self.df_error)
Ejemplo n.º 3
0
    def _modified_regressor_summary(clf, X, y, xlabels=None):
        """
        Build summary statistics for a fitted regression model.

        Parameters
        ----------
        clf : sklearn.linear_model
            A scikit-learn linear model with a `predict()` method and
            `intercept_` / `coef_` attributes.
        X : numpy.ndarray
            Training data used to fit the model.
        y : numpy.ndarray
            Target training values, of shape = [n_samples].
        xlabels : list, tuple
            The labels for the predictors. When omitted, the labels
            ``x1 .. xN`` are generated, one per column of ``X``.

        Returns
        -------
        tuple
            ``(resids_df, coef_df, fit_stats)``: a one-row data frame with
            the rounded five-number summary of the raw residuals, a data
            frame of rounded coefficient estimates with their standard
            errors, t values and p values (the intercept is the first row),
            and a dict holding ``'R2'``, ``'Adj R2'`` and ``'F-statistic'``.

        Raises
        ------
        AssertionError
            If the number of labels differs from the number of columns in X.
        """
        # Resolve the predictor labels, generating defaults when none given.
        n_predictors = X.shape[1]
        if xlabels is None:
            default_names = [
                'x{0}'.format(pos + 1) for pos in range(n_predictors)]
            xlabels = np.array(default_names, dtype='str')
        elif isinstance(xlabels, (tuple, list)):
            xlabels = np.array(xlabels, dtype='str')

        # The label count has to agree with the column count of X.
        if xlabels.shape[0] != n_predictors:
            raise AssertionError(
                "Dimension of xlabels {0} does not match "
                "X {1}.".format(xlabels.shape, X.shape))

        # Coefficient table: intercept row followed by one row per predictor.
        row_names = ['_intercept'] + list(xlabels)
        stat_columns = ['Estimate', 'Std. Error', 't value', 'p value']
        coef_df = pd.DataFrame(index=row_names, columns=stat_columns)

        intercept_part = np.round(np.array([clf.intercept_]), 6)
        slope_part = np.round((clf.coef_), 6)
        coef_df['Estimate'] = np.concatenate((intercept_part, slope_part))
        coef_df['Std. Error'] = np.round(stats.coef_se(clf, X, y), 6)
        coef_df['t value'] = np.round(stats.coef_tval(clf, X, y), 4)
        coef_df['p value'] = np.round(stats.coef_pval(clf, X, y), 6)

        # Five-number summary of the raw residuals, rounded to 4 decimals.
        resids = stats.residuals(clf, X, y, r_type='raw')
        summary_points = [
            ('Min', resids.min()),
            ('1Q', np.percentile(resids, q=25)),
            ('Median', np.median(resids)),
            ('3Q', np.percentile(resids, q=75)),
            ('Max', resids.max()),
        ]
        resids_df = pd.DataFrame(
            {label: pd.Series(np.round(value, 4))
             for label, value in summary_points},
            columns=[label for label, _ in summary_points])

        # Overall goodness-of-fit figures.
        fit_stats = {
            'R2': stats.metrics.r2_score(y, clf.predict(X)),
            'Adj R2': stats.adj_r2_score(clf, X, y),
            'F-statistic': stats.f_stat(clf, X, y),
        }
        return resids_df, coef_df, fit_stats
Ejemplo n.º 4
0
 def calculate_f_stat(self):
     """Return the result of ``stats.f_stat`` for this object's model.

     Passes ``self.model``, ``self.params_df`` and ``self.result_nd`` to
     ``stats.f_stat`` in that order and returns its result unchanged.
     """
     f_stat_fn = stats.f_stat
     return f_stat_fn(self.model, self.params_df, self.result_nd)