Exemple #1
0
def test_manova_no_formula_no_hypothesis():
    """Smoke-test ``MANOVA.mv_test`` with no hypotheses and no formula."""
    # Same model as the previous test, built directly from data frames
    responses = X[['Basal', 'Occ', 'Max']]
    location_dummies = pd.get_dummies(X[['Loc']], drop_first=True)
    design = add_constant(location_dummies)
    results = MANOVA(responses, design).mv_test()
    assert isinstance(results, MultivariateTestResults)
Exemple #2
0
 def __init__(self):
     """Fit the AFT model on the heart data and load the reference results."""
     heart_data = heart.load()
     # The model is fitted on log10 of the response
     log_endog = np.log10(heart_data.endog)
     design = add_constant(heart_data.exog)
     self.mod1 = emplikeAFT(log_endog, design, heart_data.censors)
     self.res1 = self.mod1.fit()
     self.res2 = AFTRes()
 def test_forecast(self):
     """Compare the first predicted row with the stored forecast (3 decimals)."""
     n_obs = len(self.true['data']['consump'])
     last_index = n_obs + 15 - 1
     forecast_exog = add_constant(self.true['forecast_data']['m2'])
     predicted = self.result.predict(end=last_index, exog=forecast_exog)
     assert_almost_equal(predicted[0], self.true['forecast'], 3)
def test_multiple_constraints():
    """Check RecursiveLS with two linear constraints against stored values."""
    regressors = add_constant(dta[['m1', 'unemp', 'cpi']])
    restrictions = ['m1 + unemp = 1', 'cpi = 0']

    mod = RecursiveLS(dta['infl'], regressors, constraints=restrictions)
    res = mod.fit()

    # Parameters -- see tests/results/test_rls.do
    expected_params = [-0.7001083844336, -0.0018477514060, 1.0018477514060, 0]
    assert_allclose(res.params, expected_params, atol=1e-10)

    # Standard errors -- see tests/results/test_rls.do
    expected_bse = [.4699552366, .0005369357, .0005369357, 0]
    assert_allclose(res.bse[0], expected_bse[0], atol=1e-1)
    assert_allclose(res.bse[1:-1], expected_bse[1:-1], atol=1e-4)

    # Log-likelihood -- see tests/results/test_rls.do
    expected_llf = -534.4292052931121
    # Note that to compute what Stata reports as the llf, we need to use a
    # different denominator for estimating the scale, and then compute the
    # llf from the alternative recursive residuals
    rescaled_errors = (res.standardized_forecasts_error[0, 1:] *
                       res.filter_results.obs_cov[0, 0]**0.5)
    scale_alternative = np.sum(rescaled_errors**2) / mod.nobs
    densities = norm.pdf(res.resid_recursive, loc=0,
                         scale=scale_alternative**0.5)
    llf_alternative = np.log(densities).sum()
    assert_allclose(llf_alternative, expected_llf)
Exemple #5
0
 def setup_class(cls):
     """Fit the AFT model on the heart data and load the reference results."""
     heart_data = heart.load()
     # The model is fitted on log10 of the response
     log_endog = np.log10(heart_data.endog)
     design = add_constant(heart_data.exog)
     cls.mod1 = emplikeAFT(log_endog, design, heart_data.censors)
     cls.res1 = cls.mod1.fit()
     cls.res2 = AFTRes()
Exemple #6
0
def test_manova_no_formula():
    """Check the 'Loc' multivariate test statistics without the formula API."""
    # Same as previous test only skipping formula interface
    responses = X[['Basal', 'Occ', 'Max']]
    design = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    mod = MANOVA(responses, design)

    # Contrast matrices: one row selecting the first column, and two rows
    # selecting the second and third columns
    intercept = np.zeros((1, 3))
    intercept[0, 0] = 1
    loc = np.zeros((2, 3))
    loc[0, 1] = 1
    loc[1, 2] = 1
    r = mod.mv_test([('Intercept', intercept), ('Loc', loc)])

    # (column, statistic, expected value, decimals), in the original order
    expectations = [
        ('Value', "Wilks' lambda", 0.60143661, 8),
        ('Value', "Pillai's trace", 0.44702843, 8),
        ('Value', "Hotelling-Lawley trace", 0.58210348, 8),
        ('Value', "Roy's greatest root", 0.35530890, 8),
        ('F Value', "Wilks' lambda", 0.77, 2),
        ('F Value', "Pillai's trace", 0.86, 2),
        ('F Value', "Hotelling-Lawley trace", 0.75, 2),
        ('F Value', "Roy's greatest root", 1.07, 2),
        ('Num DF', "Wilks' lambda", 6, 3),
        ('Num DF', "Pillai's trace", 6, 3),
        ('Num DF', "Hotelling-Lawley trace", 6, 3),
        ('Num DF', "Roy's greatest root", 3, 3),
        ('Den DF', "Wilks' lambda", 16, 3),
        ('Den DF', "Pillai's trace", 18, 3),
        ('Den DF', "Hotelling-Lawley trace", 9.0909, 4),
        ('Den DF', "Roy's greatest root", 9, 3),
        ('Pr > F', "Wilks' lambda", 0.6032, 4),
        ('Pr > F', "Pillai's trace", 0.5397, 4),
        ('Pr > F', "Hotelling-Lawley trace", 0.6272, 4),
        ('Pr > F', "Roy's greatest root", 0.4109, 4),
    ]
    for column, statistic, expected, decimals in expectations:
        assert_almost_equal(r['Loc']['stat'].loc[statistic, column],
                            expected, decimal=decimals)
def test_plots():
    """Smoke-test the RecursiveLS plotting methods (skipped without matplotlib)."""
    if not have_matplotlib:
        raise SkipTest

    design = add_constant(dta[['m1', 'pop']])
    res = RecursiveLS(endog, design).fit()

    # Recursive coefficients: default call
    plt.close(res.plot_recursive_coefficient())

    # Recursive coefficients: a single variable, then all variables
    for selection in (['m1'], [0, 'm1', 'pop']):
        plt.close(res.plot_recursive_coefficient(variables=selection))

    # CUSUM: default call, then other alphas
    plt.close(res.plot_cusum())
    for alpha in (0.01, 0.10):
        plt.close(res.plot_cusum(alpha=alpha))

    # Invalid alpha
    assert_raises(ValueError, res.plot_cusum, alpha=0.123)

    # CUSUM of squares: default call
    plt.close(res.plot_cusum_squares())

    # Numpy input (no dates): repeat the three default plots
    res = RecursiveLS(endog.values, design.values).fit()
    plt.close(res.plot_recursive_coefficient())
    plt.close(res.plot_cusum())
    plt.close(res.plot_cusum_squares())
    def __init__(self):
        """Build the fixture with the exog coefficients moved into the state."""
        # Remove the regression coefficients from the parameters, since they
        # will be estimated as part of the state vector
        expected = dict(results_sarimax.friedman2_mle)
        scaled_exog = add_constant(expected['data']['m2']) / 10.

        # Keep copies of the MLE values under separate keys ...
        expected['mle_params_exog'] = expected['params_exog'][:]
        expected['mle_se_exog'] = expected['se_exog_oim'][:]

        # ... and blank the entries used to build the parameter vector
        expected['params_exog'] = []
        expected['se_exog'] = []

        super(TestFriedmanStateRegression, self).__init__(
            expected, exog=scaled_exog, mle_regression=False
        )

        self.result = self.model.filter()
    def __init__(self, true, exog=None, *args, **kwargs):
        """Create the SARIMAX(1, 0, 1) model and load the reference parameters.

        ``true`` holds the reference data and parameter values; ``exog``
        defaults to the ``m2`` series with a constant added. Extra positional
        and keyword arguments are forwarded to ``sarimax.SARIMAX``.
        """
        self.true = true
        consumption = np.r_[true['data']['consump']]
        if exog is None:
            exog = add_constant(true['data']['m2'])

        # Defaults applied only when the caller has not set these options
        for option in ('simple_differencing', 'hamilton_representation'):
            kwargs.setdefault(option, True)

        self.model = sarimax.SARIMAX(
            consumption, exog=exog, order=(1, 0, 1), *args, **kwargs
        )

        # Parameter order: exog, AR, MA, variance
        reference_params = np.r_[true['params_exog'], true['params_ar'],
                                 true['params_ma'], true['params_variance']]
        self.model.update(reference_params)
 def setup_class(cls):
     """Load the Filardo data set and delegate to the parent ``setup_class``."""
     csv_path = os.path.join(current_path, 'results', 'mar_filardo.csv')
     cls.mar_filardo = pd.read_csv(csv_path)

     reference_params = np.r_[4.35941747, -1.6493936, 1.7702123, 0.9945672,
                              0.517298, -0.865888,
                              np.exp(-0.362469)**2,
                              0.189474, 0.079344, 0.110944, 0.122251]
     true = {
         'params': reference_params,
         'llf': -586.5718,
         'llf_fit': -586.5718,
         'llf_fit_em': -586.5718
     }

     # endog drops the first row and the tvtp regressor drops the last row,
     # so the two series have equal length
     dlip = cls.mar_filardo['dlip'].iloc[1:].values
     leading = cls.mar_filardo['dmdlleading'].iloc[:-1].values
     exog_tvtp = add_constant(leading)
     super(TestFilardo, cls).setup_class(
         true, dlip, k_regimes=2, order=4, switching_ar=False,
         exog_tvtp=exog_tvtp)
Exemple #11
0
    def wls(data, use_bayes=False):
        """Weighted least squares for the peptides of one protein.

        Operates on sub data frames. The control (``meanC``) and experiment
        (``meanE``) means are stacked into one response vector and regressed
        on a 0/1 group indicator plus a constant, weighted by the inverse of
        the squared standard deviations.

        Parameters
        ----------
        data : DataFrame
            Must provide ``meanC``/``meanE`` and either ``stdC``/``stdE`` or,
            when ``use_bayes`` is true, ``bayesSDC``/``bayesSDE``.
        use_bayes : bool
            Select the ``bayesSD*`` columns instead of ``std*`` for weights.

        Returns
        -------
        tuple
            ``(params[0], bse[0], pvalues[0])`` of the fit, i.e. the entries
            for the group indicator (the constant is appended last).
        """
        n_rows = data.shape[0]
        # Degenerate case, only one peptide
        if n_rows == 1:
            return wls_degenerate(data)

        # ``Series.append`` no longer exists in pandas 2.x; concatenate the
        # underlying arrays instead, which yields the same values.
        y = np.concatenate([data['meanC'].values, data['meanE'].values])
        sd_control, sd_experiment = (
            ('bayesSDC', 'bayesSDE') if use_bayes else ('stdC', 'stdE'))
        w = np.concatenate(
            [data[sd_control].values, data[sd_experiment].values])**2

        # Group indicator: 0 for the first (control) half, 1 for the second
        x = np.ones(n_rows * 2)
        x[:n_rows] = 0

        mod_wls = sm.WLS(y, add_constant(x, prepend=False), weights=1./w)
        res_wls = mod_wls.fit()
        return (res_wls.params[0], res_wls.bse[0], res_wls.pvalues[0])
 def setup_class(cls):
     """Load the Filardo data set and delegate to the parent ``setup_class``."""
     csv_path = os.path.join(current_path, 'results', 'mar_filardo.csv')
     cls.mar_filardo = pd.read_csv(csv_path)

     reference_params = np.r_[4.35941747, -1.6493936, 1.7702123, 0.9945672,
                              0.517298, -0.865888,
                              np.exp(-0.362469)**2,
                              0.189474, 0.079344, 0.110944, 0.122251]
     true = {
         'params': reference_params,
         'llf': -586.5718,
         'llf_fit': -586.5718,
         'llf_fit_em': -586.5718
     }

     # endog drops the first row and the tvtp regressor drops the last row,
     # so the two series have equal length
     dlip = cls.mar_filardo['dlip'].iloc[1:].values
     leading = cls.mar_filardo['dmdlleading'].iloc[:-1].values
     exog_tvtp = add_constant(leading)
     super(TestFilardo, cls).setup_class(
         true, dlip, k_regimes=2, order=4, switching_ar=False,
         exog_tvtp=exog_tvtp)
Exemple #13
0
def _check_constant_params(a, has_const=False, use_const=True, rtol=1e-05,
                           atol=1e-08):
    """Helper func to interaction between has_const and use_const params.

    has_const   use_const   outcome
    ---------   ---------   -------
    True        True        Confirm that a has constant; return a
    False       False       Confirm that a doesn't have constant; return a
    False       True        Confirm that a doesn't have constant; add constant
    True        False       ValueError
    """

    if all((has_const, use_const)):
        if not _confirm_constant(a):
            raise ValueError('Data does not contain a constant; specify'
                             ' has_const=False')
        k = a.shape[-1] - 1
    elif not any((has_const, use_const)):
        if _confirm_constant(a):
            raise ValueError('Data already contains a constant; specify'
                             ' has_const=True')
        k = a.shape[-1]
    elif not has_const and use_const:
        # Also run a quick check to confirm that `a` is *not* ~N(0,1).
        #     In this case, constant should be zero. (exclude it entirely)
        c1 = np.allclose(a.mean(axis=0), b=0., rtol=rtol, atol=atol)
        c2 = np.allclose(a.std(axis=0), b=1., rtol=rtol, atol=atol)
        if c1 and c2:
            # TODO: maybe we want to just warn here?
            raise ValueError('Data appears to be ~N(0,1).  Specify'
                             ' use_constant=False.')
        # `has_constant` does checking on its own and raises VE if True
        try:
            a = add_constant(a, has_constant='raise')
        except ValueError as e:
            raise ValueError(
                'X data already contains a constant; please specify'
                ' has_const=True'
            ) from e
        k = a.shape[-1] - 1
    else:
        raise ValueError('`use_const` == False implies has_const is False.')

    return k, a
def test_plots(close_figures):
    """Smoke-test the RecursiveLS plotting methods on pandas and numpy input."""
    design = add_constant(dta[['m1', 'pop']])
    res = RecursiveLS(endog, design).fit()

    # Register pandas' matplotlib converters when this pandas provides them
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass

    # Recursive coefficients: default, a single variable, then all variables
    res.plot_recursive_coefficient()
    for selection in (['m1'], [0, 'm1', 'pop']):
        res.plot_recursive_coefficient(variables=selection)

    # CUSUM: default call, then other alphas
    res.plot_cusum()
    for alpha in (0.01, 0.10):
        res.plot_cusum(alpha=alpha)

    # Invalid alpha
    assert_raises(ValueError, res.plot_cusum, alpha=0.123)

    # CUSUM of squares: default call
    res.plot_cusum_squares()

    # Numpy input (no dates): repeat the three default plots
    res = RecursiveLS(endog.values, design.values).fit()
    res.plot_recursive_coefficient()
    res.plot_cusum()
    res.plot_cusum_squares()
Exemple #15
0
def test_plots(close_figures):
    """Smoke-test the RecursiveLS plotting methods on pandas and numpy input."""
    design = add_constant(dta[['m1', 'pop']])
    res = RecursiveLS(endog, design).fit()

    # Register pandas' matplotlib converters when this pandas provides them
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass

    # Recursive coefficients: default, a single variable, then all variables
    res.plot_recursive_coefficient()
    for selection in (['m1'], [0, 'm1', 'pop']):
        res.plot_recursive_coefficient(variables=selection)

    # CUSUM: default call, then other alphas
    res.plot_cusum()
    for alpha in (0.01, 0.10):
        res.plot_cusum(alpha=alpha)

    # Invalid alpha
    assert_raises(ValueError, res.plot_cusum, alpha=0.123)

    # CUSUM of squares: default call
    res.plot_cusum_squares()

    # Numpy input (no dates): repeat the three default plots
    res = RecursiveLS(endog.values, design.values).fit()
    res.plot_recursive_coefficient()
    res.plot_cusum()
    res.plot_cusum_squares()
Exemple #16
0
def trend(x, time='time', detrend=False):
    """Return the linear trend per year of `x`, or `x` with the trend removed.

    A least-squares line (constant + slope) is fitted against the `time`
    coordinate converted to seconds.

    Parameters
    ----------
    x : xarray.DataArray
        Data with a datetime coordinate named by `time` and at most one
        further coordinate of length > 1.
    time : str
        Name of the time coordinate.
    detrend : bool
        If true, return `x` minus the fitted line instead of the slope.

    Returns
    -------
    The slope multiplied by the number of seconds in a year (a scalar, or a
    DataArray over the extra coordinate), or the detrended data when
    `detrend` is true.

    Raises
    ------
    ValueError
        If `x` has more than one additional non-trivial coordinate.
    """
    from statsmodels.tools import add_constant
    year = 3600 * 24 * 365.24  # slope is w.r.t. seconds
    t = add_constant(x[time].values.astype('datetime64[s]').astype(float))
    lsq = np.linalg.lstsq(t, x.squeeze())[0]
    # Coordinates other than time that are non-scalar and longer than one
    coords = [
        c for c in set(x.coords) - {time}
        if (x.coords[c].shape != ()) and (len(x.coords[c]) > 1)
    ]
    if not coords:
        return x - t.dot(lsq) if detrend else lsq[1] * year
    if len(coords) > 1:
        raise ValueError('more than one additional coordinate')

    extra = x.coords[coords[0]]
    if detrend:
        # Look the time coordinate up by name so a non-default `time`
        # argument is honoured (previously hard-coded as `x.time`).
        return xr.DataArray(x - t.dot(lsq), coords=[x[time], extra])
    return xr.DataArray(lsq[1, :], coords=[extra]) * year
def backwardElimination(input_matrix, output_array, significance_level=0.05):
    """Select regressors by backward elimination on OLS p-values.

    A constant column is added, then the variable with the largest p-value
    is dropped repeatedly until every remaining variable has a p-value at or
    below `significance_level`.

    Parameters
    ----------
    input_matrix : DataFrame
        Candidate regressors (column labels are used for reporting).
    output_array : array-like
        Dependent variable.
    significance_level : float
        Largest p-value a variable may have and still be kept.

    Returns
    -------
    DataFrame or None
        The columns that survived (without the constant), or None when no
        variable is significant.
    """
    data = add_constant(input_matrix)
    candidate_variables = list(data.columns)

    # >1 because we've added a 'const' column
    while len(candidate_variables) > 1:
        data = data.loc[:, candidate_variables]
        regressor = sm.OLS(endog=output_array, exog=data).fit()
        # Only real variables may be eliminated. Position 0 is the constant;
        # dropping it would make the `[1:]` slices below discard a real
        # variable instead.
        p_values = list(regressor.pvalues)
        worst_index, p_value = max(enumerate(p_values[1:], start=1),
                                   key=itemgetter(1))
        if p_value > significance_level:
            print(f"Eliminating '{candidate_variables[worst_index]}' with p-value {p_value:.2}")
            del candidate_variables[worst_index]
        else:
            print(f"Final variable selection: {candidate_variables[1:]}")
            print(regressor.summary())
            return data.loc[:, candidate_variables[1:]]

    print("No significant correlation found for any variables")
    return None
Exemple #18
0
    def computeForDay(self, strategy, timeSeriesTick, timeSeriesTrade):
        """Compute rolling OLS coefficients of price against elapsed seconds.

        The tick series is resampled to a regular grid of
        ``self.resamplePeriod`` seconds (first value per bin, gaps filled
        forward), then ``price`` is regressed on elapsed seconds plus a
        constant over a rolling window of ``self.period`` rows.

        ``strategy`` and ``timeSeriesTrade`` are not used here.

        Returns
        -------
        dict
            ``{"betaSeries": params}`` with the rolling parameter estimates,
            also stored on ``self.betaSeries``.
        """
        rule = str(int(self.resamplePeriod)) + "S"
        # ``ffill()`` replaces the deprecated ``fillna(method="pad")``
        timeSeriesReg = timeSeriesTick.resample(rule).first().ffill()
        timeTable = timeSeriesReg.to_frame()
        timeTable["second"] = timeSeriesReg.index.astype(np.int64)
        # Elapsed time since the first sample. ``.iloc[0]`` makes the
        # positional lookup explicit on this label-indexed column.
        timeTable["second"] = (
            timeTable["second"] - timeTable["second"].iloc[0]
        ) / math.pow(10, 9)

        # Replaces the former pd.stats.ols.MovingOLS(..., window_type='rolling',
        # window=self.period, intercept=True).beta computation
        mod = RollingOLS(
            timeTable["price"],
            add_constant(timeTable["second"], prepend=False),
            window=self.period,
        )
        self.betaSeries = mod.fit().params
        return {"betaSeries": self.betaSeries}
def ols_fit_train(y_array,df,col_list):
    '''
    Fit an OLS model of ``y_array`` on the ``col_list`` columns of ``df``
    (with a constant added) and return the fit summary.

    NOTE(review): ``y_col_name`` is neither a parameter nor a local of this
    function, so the ``patsy_input_str`` call below only works if a global of
    that name exists -- confirm, or pass it in explicitly.
    NOTE(review): the patsy matrices ``y`` and ``X`` are built but the model
    below does not use them.
    '''
    input_string = patsy_input_str(df,y_col_name)
    # Create your feature matrix (X) and target vector (y)

    y, X = patsy.dmatrices(input_string, data=df, return_type="dataframe")

    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)
    
    # Create your model (uses y_array and the selected df columns, not y/X)
    model = sm.OLS(y_array, add_constant(df.loc[:,col_list]))

    # Fit your model to your training set
    fit = model.fit()

    # Return summary statistics of the model's performance
    return fit.summary()
Exemple #20
0
def basic_logistic_regression(df,
                              cutoff,
                              col='pop_bin',
                              rand=0,
                              sig_only=False):
    """Fit a logistic regression on a standardized design with a constant.

    Works on a copy of ``df``: the sample is built by
    ``split_sample_combine`` and split into X/y by ``return_X_y_logistic``,
    X is standardized, and a constant column is prepended. The design, the
    response and the model summary are printed; the fitted model is returned.

    ``sig_only`` is accepted but not used in this function.
    """
    df = df.copy()
    X, y = return_X_y_logistic(split_sample_combine(df, cutoff, col,
                                                    rand=rand))
    X = standardize_X(X)
    X_const = add_constant(X, prepend=True)
    print("X_const\n", X_const)
    print("Y\n", y)

    # NOTE(review): `solver=` and `max_iter=` look like scikit-learn keyword
    # names; if `Logit` is the statsmodels class, its fit() documents
    # `method=` and `maxiter=` instead, so these two may be silently ignored
    # -- confirm against the statsmodels docs before relying on lbfgs here.
    logit_model = Logit(y, X_const).fit(solver='lbfgs',
                                        skip_hessian=True,
                                        max_iter=20000)

    print(logit_model.summary())

    return logit_model
Exemple #21
0
def linear_regression(Y,
                      X,
                      multiple_X=1,
                      fix_nan=True,
                      alfa=False,
                      integrate=False):
    """
    Convenience wrapper around an OLS fit.

    Y (list[float]) - dependent variable;
    X (list[float] | float) - independent variable(s);
    multiple_X (int, default=1) - number of factor columns X is reshaped to;
    fix_nan (bool) - use missing='drop' for the fit, otherwise 'none';
    alfa (bool) - add a constant column to X before fitting;
    integrate (bool) - first pass Y and X through
        data_tests.stationarity.forceSTATxy and also return its first result.

    Returns the fitted model, or (model, integrations) when integrate is set.
    """
    response = np.array(Y).reshape(-1, 1)
    regressors = np.array(X).reshape(-1, multiple_X)

    if integrate:
        integrations, response, regressors = (
            data_tests.stationarity.forceSTATxy(response, regressors))
    if alfa:
        regressors = add_constant(regressors)

    missing_policy = 'drop' if fix_nan else 'none'
    model = OLS(response, regressors, missing=missing_policy).fit()

    if integrate:
        return (model, integrations)
    return model
def compute_variance_inflation_factor(X: np.array) -> np.array:
    """
    Compute the variance inflation factor for each feature of a matrix

    Parameters
    ----------
    {X}

    Returns
    -------
    vif: np.array of shape = (n_features)
        variance inflation factor of each feature of the input matrix;
        NaN results are reported as infinity
    """

    # Prepend a constant column, as suggested here:
    # https://stackoverflow.com/questions/42658379/variance-inflation-factor-in-python
    design = add_constant(X, prepend=True)
    n_columns = design.shape[1]
    all_factors = np.array(
        [oi.variance_inflation_factor(design, j) for j in range(n_columns)])

    # Drop the first entry, which belongs to the constant column
    vif = all_factors[1:].copy()
    return np.where(np.isnan(vif), np.inf, vif)
    def fit(self, X, y, force_include_idx=None):
        ''' Estimate a model using Post-Lasso

        X: X matrix (without intercept)
        y: y vector
        force_include_idx: column indexes that are ALWAYS
            included in the OLS model, regardless of their
            status in the lasso stage.

        Returns self, with lasso_model, coefs, subset_cols,
        relevant_x and ols_model set.
        '''
        self.lasso_model = self.lasso_model.fit(X, y)

        # Coefficient vector with the intercept placed at position 0
        lasso = self.lasso_model
        self.coefs = np.insert(lasso.coef_, 0, lasso.intercept_)

        # Positions whose coefficient after the lasso is not zero
        self.subset_cols = np.nonzero(self.coefs != 0)[0]
        if force_include_idx is not None:
            # merge in the positions the caller always wants kept
            self.subset_cols = np.union1d(self.subset_cols, force_include_idx)

        # Add a constant to X, then keep only the selected columns
        design = add_constant(X)
        self.relevant_x = design[:, self.subset_cols]
        self.ols_model = OLS(y, self.relevant_x).fit()
        return self
Exemple #24
0
def variance_inflation_factors(exog_df):
    '''
    Compute 1 / (1 - R^2) for every column, where R^2 comes from an OLS
    regression of that column on all the other columns. A constant is added
    to the design first.

    Parameters
    ----------
    exog_df : dataframe, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression.

    Returns
    -------
    vif : Series
        variance inflation factors, indexed by column label
    '''
    exog_df = add_constant(exog_df)
    factors = []
    for col in exog_df:
        target = exog_df[col].values
        others = exog_df.loc[:, exog_df.columns != col].values
        r_squared = OLS(target, others).fit().rsquared
        factors.append(1 / (1. - r_squared))
    return pd.Series(factors, index=exog_df.columns, name='VIF')
Exemple #25
0
def gen_data(nobs, nvar, const, pandas=False, missing=0.0, weights=False):
    """Generate a reproducible regression data set from a fixed seed.

    Parameters
    ----------
    nobs, nvar : int
        Number of observations and of random regressors.
    const : bool
        Add a constant column to the regressors.
    pandas : bool
        Return pandas objects on a daily date index instead of arrays.
    missing : float
        Probability with which each regressor entry is replaced by NaN.
    weights : bool
        Return the generated weights; otherwise the third item is None.

    Returns
    -------
    y, x, w
        Response, regressors and weights (or None).
    """
    rng = np.random.RandomState(987499302)
    x = rng.standard_normal((nobs, nvar))
    cols = ["x" + str(i) for i in range(nvar)]
    if const:
        x = tools.add_constant(x)
        cols = ["const"] + cols
    if missing > 0.0:
        drop_mask = rng.random_sample(x.shape) < missing
        x[drop_mask] = np.nan

    # The response sums all columns but the last (or the only column)
    summands = x[:, :-1] if x.shape[1] > 1 else x
    y = summands.sum(1) + rng.standard_normal(nobs)
    w = rng.chisquare(5, y.shape[0]) / 5

    if pandas:
        idx = pd.date_range("12-31-1999", periods=nobs)
        x = pd.DataFrame(x, index=idx, columns=cols)
        y = pd.Series(y, index=idx, name="y")
        w = pd.Series(w, index=idx, name="weights")

    return y, x, (w if weights else None)
Exemple #26
0
def calculate_QQplot(data1, data2, a=0):
    """Compute matched sample quantiles of two data sets and their OLS fit.

    Returns ``(s1, s2, s2_fitted, r)``: the two quantile arrays (the longer
    one re-evaluated at the plotting positions of the shorter one), the
    fitted values of regressing ``s2`` on ``s1`` with a constant, and the
    square root of that regression's R-squared.
    """
    def _sample_quantiles(data):
        values = np.array(data, dtype=float)
        return gofplots.ProbPlot(values, a=a).sample_quantiles

    def _match_quantile_probabilities(quantiles1, quantiles2):
        # Resample whichever side is strictly longer; on a tie the second
        # side is the one resampled
        if len(quantiles1) > len(quantiles2):
            probs = gofplots.plotting_pos(len(quantiles2), a)
            return scstats.mstats.mquantiles(quantiles1, probs), quantiles2
        probs = gofplots.plotting_pos(len(quantiles1), a)
        return quantiles1, scstats.mstats.mquantiles(quantiles2, probs)

    s1, s2 = _match_quantile_probabilities(_sample_quantiles(data1),
                                           _sample_quantiles(data2))

    fit = OLS(s2, add_constant(s1)).fit()
    return s1, s2, fit.fittedvalues, np.sqrt(fit.rsquared)
Exemple #27
0
def glm_regularized_AIC(X,
                        Y,
                        reg_mod,
                        unreg_mod,
                        tol=1e-6,
                        method="kawano",
                        family="binomial"):
    """
    Calculate AIC for a Generalized Linear Model with regularization.

    See 'AIC for the Lasso in GLMs', Y. Ninomiya and S. Kawano (2016)

    Parameters
    ----------
    X : numpy array or pandas dataframe
        Feature or design matrix (without an intercept column).
    Y : numpy array, pandas series or pandas dataframe
        Target or response variable.
    reg_mod : sklearn, or similar
        The regularized model. Must expose ``intercept_`` and ``coef_``, and
        ``predict_proba`` (binomial) or ``predict`` (other families). For
        poisson, ``predict`` should return the already exponentiated mean.
    unreg_mod : sklearn, or similar
        The unregularized model, with the same prediction interface.
    tol : float, optional
        Tolerance cutoff for counting non-zero coefficients. The default is
        1e-6.
    method : str, optional
        The method for calculating the AIC. ``"kawano"`` selects the
        Ninomiya-Kawano penalty; any other value uses the count of non-zero
        coefficients (Hastie). The default is "kawano".
    family : str, optional
        The type of generalised linear model: "binomial", "poisson" or
        "gaussian". The default is "binomial".

    Raises
    ------
    ValueError
        Raised if an invalid family is picked.

    Returns
    -------
    aic : float
        The calculated AIC.

    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    # isinstance needs a tuple; `pd.DataFrame or pd.Series` evaluates to
    # pd.DataFrame alone, so a Series was never converted.
    if isinstance(Y, (pd.DataFrame, pd.Series)):
        Y = Y.values

    if family == "binomial":
        nllf = glm_likelihood_bernoulli
        unreg_prob = unreg_mod.predict_proba(X)[:, 1]
        reg_prob = reg_mod.predict_proba(X)[:, 1]
        y_mat_unreg = np.diag(unreg_prob * (1 - unreg_prob))
        y_mat_reg = np.diag(reg_prob * (1 - reg_prob))
    elif family == "poisson":
        nllf = glm_likelihood_poisson
        unreg_pred = unreg_mod.predict(X)
        reg_pred = reg_mod.predict(X)
        # Use the predictions computed above; the *_prob names only exist in
        # the binomial branch and raised NameError here.
        y_mat_unreg = np.diag(unreg_pred)
        y_mat_reg = np.diag(reg_pred)
    elif family == "gaussian":
        nllf = glm_likelihood_gaussian
        unreg_pred = unreg_mod.predict(X)
        reg_pred = reg_mod.predict(X)
        y_mat_unreg = np.diag(unreg_pred)
        y_mat_reg = np.diag(reg_pred)
    else:
        raise ValueError("Not a valid family")

    # atleast_1d lets scalar intercepts and single-feature models through
    # np.concatenate as well.
    intercept = np.atleast_1d(reg_mod.intercept_)
    slopes = np.atleast_1d(np.squeeze(reg_mod.coef_))
    reg_mod_coef = np.concatenate((intercept, slopes))
    # Positions in the intercept-augmented coefficient vector
    nonzero_idx = np.flatnonzero(np.abs(reg_mod_coef) > tol)

    if method == "kawano":
        # X has no intercept column, so its columns must be selected with
        # positions in `slopes`, not in the augmented vector (which is
        # shifted by one and can run past the last column). The intercept
        # is supplied by add_constant.
        active_cols = np.flatnonzero(np.abs(slopes) > tol)
        X_nz = add_constant(X[:, active_cols], prepend=True)

        j22 = np.linalg.multi_dot([X_nz.T, y_mat_reg, X_nz])
        j22_2 = np.linalg.multi_dot([X_nz.T, y_mat_unreg, X_nz])

        negloglike = nllf(reg_mod_coef, X, Y, lamb=0, l_norm=0)
        aic = 2 * negloglike + np.sum(np.diag(np.linalg.inv(j22).dot(j22_2)))
    else:
        # Tibshirani, Hastie, Zou 2007, On the degrees of freedom on the lasso
        negloglike = nllf(reg_mod_coef, X, Y, lamb=0, l_norm=0)
        # Not 100% sure on this calculation. should it be 2*len(nonzero_idx),
        # the count of nonzero columns, or 2*mean(non_zero_idx)?
        aic = 2 * negloglike + 2 * len(nonzero_idx)
    return aic
Exemple #28
0
# With no observations, use the base model unchanged; otherwise report its
# errors and, for selected metrics, fit a linear correction on top of it.
if len(df) == 0:
    model = irimodel
else:
    pred = irimodel.predict(df['station.longitude'].values,
                            df['station.latitude'].values)
    error = pred - df[metric].values
    print(df[metric].values)
    print(pred)
    print(error)
    # Two summaries normalised by the sum of the `cs` column: a root of the
    # scaled squared error, and the scaled signed error
    print(np.sqrt(np.sum(error**2) / np.sum(df.cs.values)),
          np.sum(error) / np.sum(df.cs.values))

    if metric in ['mufd', 'fof2']:
        # Regress the residual (observed - pred) on pred plus a constant,
        # with `cs` passed as the third positional argument of WLS
        # (presumably the weights -- confirm against the WLS signature).
        wls_model = sm.WLS(df[metric].values - pred,
                           add_constant(pred, prepend=False), df.cs.values)
        wls_fit = wls_model.fit_regularized(alpha=np.array([1, 3]), L1_wt=0)
        coeff = wls_fit.params
        # The response was (observed - pred), so adding 1 to the coefficient
        # on pred turns it into the coefficient of observed on pred.
        coeff[0] = coeff[0] + 1
        print(coeff)

        # Wrap the base model with the fitted coefficients and report the
        # same diagnostics for the corrected model
        irimodel = LinearModel(irimodel, coeff[0], coeff[1])
        pred = irimodel.predict(df['station.longitude'].values,
                                df['station.latitude'].values)
        error = pred - df[metric].values
        print(df[metric].values)
        print(pred)
        print(error)
        print(np.sqrt(np.sum(error**2) / np.sum(df.cs.values)),
              np.sum(error) / np.sum(df.cs.values))
Exemple #29
0
def test_glm(constraints=None):
    """Compare RecursiveLS results with GLM estimates on the same data.

    With ``constraints`` given, the GLM reference is obtained through
    ``fit_constrained`` and forecasting is expected to raise
    ``NotImplementedError``.
    """
    # More comprehensive tests against GLM estimates (this is sort of redundant
    # given `test_ols`, but this is mostly to complement the tests in
    # `test_glm_constrained`)
    endog = dta.infl
    exog = add_constant(dta[['unemp', 'm1']])

    mod = RecursiveLS(endog, exog, constraints=constraints)
    res = mod.fit()

    mod_glm = GLM(endog, exog)
    if constraints is None:
        res_glm = mod_glm.fit()
    else:
        res_glm = mod_glm.fit_constrained(constraints=constraints)

    # Regression coefficients, standard errors, and estimated scale
    assert_allclose(res.params, res_glm.params)
    assert_allclose(res.bse, res_glm.bse, atol=1e-6)
    # Note: scale here is computed according to Harvey, 1989, 4.2.5, and is
    # called the ML estimator and sometimes (e.g. later in section 5)
    # denoted \tilde \sigma_*^2
    assert_allclose(res.filter_results.obs_cov[0, 0], res_glm.scale)

    # DoF
    # Note: GLM does not include intercept in DoF, so modify by -1
    assert_equal(res.df_model - 1, res_glm.df_model)

    # OLS residuals are equivalent to smoothed forecast errors
    # (the latter are defined as e_t|T by Harvey, 1989, 5.4.5)
    # (this follows since the smoothed state simply contains the
    # full-information estimates of the regression coefficients)
    actual = (mod.endog[:, 0] -
              np.sum(mod['design', 0, :, :] * res.smoothed_state, axis=0))
    assert_allclose(actual, res_glm.resid_response, atol=1e-7)

    # Given the estimate of scale as `sum(v_t^2 / f_t) / (T - d)` (see
    # Harvey, 1989, 4.2.5 on p. 183), then llf_recursive is equivalent to the
    # full OLS loglikelihood (i.e. without the scale concentrated out).
    desired = mod_glm.loglike(res_glm.params, scale=res_glm.scale)
    assert_allclose(res.llf_recursive, desired)
    # Alternatively, we can construct the concentrated OLS loglikelihood
    # by computing the scale term with `nobs` in the denominator rather than
    # `nobs - d`.
    scale_alternative = np.sum(
        (res.standardized_forecasts_error[0, 1:] *
         res.filter_results.obs_cov[0, 0]**0.5)**2) / mod.nobs
    llf_alternative = np.log(
        norm.pdf(res.resid_recursive, loc=0,
                 scale=scale_alternative**0.5)).sum()
    assert_allclose(llf_alternative, res_glm.llf)

    # Prediction
    # TODO: prediction in this case is not working.
    if constraints is None:
        design = np.ones((1, 3, 10))
        actual = res.forecast(10, design=design)
        assert_allclose(actual, res_glm.predict(np.ones((10, 3))))
    else:
        design = np.ones((2, 3, 10))
        assert_raises(NotImplementedError, res.forecast, 10, design=design)

    # Hypothesis tests
    actual = res.t_test('m1 = 0')
    desired = res_glm.t_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue, atol=1e-15)

    actual = res.f_test('m1 = 0')
    desired = res_glm.f_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue)

    # Information criteria
    # Note: the llf and llf_obs given in the results are based on the Kalman
    # filter and so the ic given in results will not be identical to the
    # OLS versions. Additionally, llf_recursive is comparable to the
    # non-concentrated llf, and not the concentrated llf that is by default
    # used in OLS. Compute new ic based on llf_alternative to compare.
    actual_aic = aic(llf_alternative, res.nobs_effective, res.df_model)
    assert_allclose(actual_aic, res_glm.aic)
Exemple #30
0
except ImportError:
    have_matplotlib = False

# Directory containing this module; the result files are located relative to it
current_path = os.path.dirname(os.path.abspath(__file__))

# Reference results read from results/results_rls_R.csv
results_R_path = 'results' + os.sep + 'results_rls_R.csv'
results_R = pd.read_csv(current_path + os.sep + results_R_path)

# Reference results read from results/results_rls_stata.csv
results_stata_path = 'results' + os.sep + 'results_rls_stata.csv'
results_stata = pd.read_csv(current_path + os.sep + results_stata_path)

# Shared data set, re-indexed with quarter-start dates from 1959-01-01 to
# 2009-07-01
dta = macrodata.load_pandas().data
dta.index = pd.date_range(start='1959-01-01', end='2009-07-01', freq='QS')

# Default regression used by the tests: cpi on a constant and m1
endog = dta['cpi']
exog = add_constant(dta['m1'])


def test_endog():
    """Check RecursiveLS with numpy input against OLS, and with 1-dim exog."""
    # Tests for numpy input
    res = RecursiveLS(endog.values, exog.values).fit()

    # Test the RLS estimates against OLS estimates
    res_ols = OLS(endog, exog).fit()
    assert_allclose(res.params, res_ols.params)

    # Tests for 1-dim exog
    one_dim_exog = dta['m1'].values
    mod = RecursiveLS(endog, one_dim_exog)
    res = mod.fit()
Exemple #31
0
ax.set_xlabel('YEAR')
ax.set_ylabel('DEC')
plt.show()

from sklearn import linear_model, feature_selection, preprocessing
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as sm
from statsmodels.tools.eval_measures import mse
from statsmodels.tools import add_constant
from sklearn.metrics import mean_squared_error

# Split the rows 80/20; every column but the last is a regressor and the
# last column is the response.
X = df.values.copy()
X_train, X_valid, y_train, y_valid = train_test_split(X[:, :-1],
                                                      X[:, -1],
                                                      train_size=0.80)

# Fit OLS with an intercept on the training split (a single fit is enough:
# the inputs are identical, so refitting yields the same result).
result = sm.OLS(y_train, add_constant(X_train)).fit()
result.summary()

# Score the fitted model on the held-out rows
ypred = result.predict(add_constant(X_valid))
print(mse(ypred, y_valid))

# Actual vs. predicted scatter for the validation split
fig, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

# In[ ]:

# In[ ]:
  square = lambda row: row**2
  sum_of_squares = df['difference'].apply(square).sum()
  return(sum_of_squares)

x0 = [-20, .0008, 1.1]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})

# Linear regression of crash counts on AADT and L with scikit-learn
clf = linear_model.LinearRegression()
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0
x = df[['AADT', 'L']].values
y = df['Crashes']
clf.fit(x, y)
clf.coef_
clf.intercept_

# Same regression through OLS; the constant column supplies the intercept
model = OLS(y, add_constant(x))
model_fit = model.fit()
model_fit.summary()

def estimator(x, row_in='Crashes'):
  """Return the negated product of per-row Poisson probabilities.

  For every row of the module-level ``df`` the rate is
  ``exp(x[0] + x[1] * AADT + x[2] * L)``; the probability of the observed
  value in column ``row_in`` under that rate is taken from ``poisson.pmf``.

  Side effect: the 'estimated' and 'probability' columns of ``df`` are
  overwritten on every call.
  """
  def expected_count(row):
    return exp(x[0] + x[1] * row['AADT'] + x[2] * row['L'])

  def observed_probability(row):
    return poisson.pmf(row[row_in], row['estimated'])

  df['estimated'] = df.apply(expected_count, axis=1)
  df['probability'] = df.apply(observed_probability, axis=1)
  # Negated so that a minimizer maximizes the product of probabilities
  return -df['probability'].product()

x0 = [1.6, .0000026, .032]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead', options={'xtol': 1e-8, 'disp': True})
plt.show()

# 1.4

df.hist()
plt.show()


# Part 2
# 2.1

from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

X = df[['gre', 'gpa', 'rank']].values
X_const = add_constant(X, prepend=True)
y = df['admit'].values

logit_model = Logit(y, X_const).fit()

# 2.2

logit_model.summary()

# 2.3

import numpy as np
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from statsmodels.tools import add_constant
from numpy.testing import assert_equal, assert_raises, assert_allclose

current_path = os.path.dirname(os.path.abspath(__file__))

results_R_path = 'results' + os.sep + 'results_rls_R.csv'
results_R = pd.read_csv(current_path + os.sep + results_R_path)

results_stata_path = 'results' + os.sep + 'results_rls_stata.csv'
results_stata = pd.read_csv(current_path + os.sep + results_stata_path)

dta = macrodata.load_pandas().data
dta.index = pd.date_range(start='1959-01-01', end='2009-07-01', freq='QS')

endog = dta['cpi']
exog = add_constant(dta['m1'])


def test_endog():
    """Exercise RecursiveLS with numpy inputs and with a 1-dim exog array."""
    # numpy input
    numpy_fit = RecursiveLS(endog.values, exog.values).fit()

    # RLS estimates versus OLS estimates
    reference_fit = OLS(endog, exog).fit()
    assert_allclose(numpy_fit.params, reference_fit.params)

    # 1-dim exog: the model is only fitted here, nothing is asserted
    RecursiveLS(endog, dta['m1'].values).fit()
Exemple #35
0
 def __init__(self):
     """Fit OLS (with a constant) on the stackloss data as ``res1``.

     ``res2`` is a fresh ``RegressionResults`` instance (presumably the
     reference values to compare against -- confirm).
     """
     dataset = stackloss.load()
     # Add the constant column to the regressors before fitting
     dataset.exog = add_constant(dataset.exog)
     fitted = OLS(dataset.endog, dataset.exog).fit()
     self.res1 = fitted
     self.res2 = RegressionResults()
Exemple #36
0
sn.countplot(x='TenYearCHD', data=heart_df)

# There are 3179 patients with no heart disease and 572 patients at risk of heart disease.

# In[21]:

sn.pairplot(data=heart_df)

# In[22]:

heart_df.describe()

# In[23]:

from statsmodels.tools import add_constant as add_constant
heart_df_constant = add_constant(heart_df)
heart_df_constant.head()

# In[24]:

st.chisqprob = lambda chisq, df: st.chi2.sf(chisq, df)
cols = heart_df_constant.columns[:-1]
model = sm.Logit(heart_df.TenYearCHD, heart_df_constant[cols])
result = model.fit()
result.summary()

# In[43]:


def back_feature_elem(data_frame, dep_var, col_list):
    """ Takes in the dataframe, the dependent variable and a list of column names, runs the regression repeatedly eleminating feature with the highest
Exemple #37
0
# label encoding and one hot encoding categorical variables: Pclass, Sex, Embarked
dataframe = pd.get_dummies(dataframe,
                           columns=['Pclass', 'Sex'],
                           drop_first=True)

# for now feature scaling seems unnecessary, but we'll add it later if it turns out to be required

# extract independent and dependent variable matrices
X = dataframe.drop(labels=['PassengerId', 'Survived'], axis=1)
# Public column access; the private DataFrame._getitem_column helper is not
# part of the pandas API and is absent from current pandas versions.
y = dataframe['Survived']

## Backward Elimination

# add a column of 1s to represent x0 variable (intercept)
X = smtools.add_constant(X)

# use Backward Elimination to get rid of insignificant variables
significance_level = 0.05
X = toolkit.backward_elimination_using_pvalues(X, y, significance_level)
# X = toolkit.backward_elimination_using_adjR2(X, y)

# Fitting Decision Tree Classification to the Training set
accuracies = {}
std = {}
classifier = DecisionTreeClassifier()
accuracies['Decision Tree'], std['Decision Tree'] = classifier.classify(X, y)

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier()
accuracies['Random Forest'], std['Random Forest'] = classifier.classify(X, y)
y = np.array([data['sales']]).reshape(-1, 1)

# %%
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)
score = model.score(X, y)
print(f"R2 Score: {score}")
print('Weight coefficients: ', model.coef_)

# %%

# << Statsmodels >>

# X needs a constant column added to match the results from scikit-learn
X1 = stat.add_constant(X)
model = sm.OLS(y, X1)
results = model.fit()
print(results.summary())

# %%
# Regression Model with Qualitative Predictors ---------------

# load data
path = '/Users/michaelshih/Documents/code/education/statistical_learining/'
subfolder = 'resource'
filename = 'Credit.csv'
filedir = os.path.join(path, subfolder, filename)
print(filedir)

data = pd.read_csv(filedir, index_col=0)
Exemple #39
0
 def test_design(self):
     """The model exog must equal the data exog with a prepended constant."""
     actual = self.model.exog
     expected = add_constant(self.data.exog, prepend=True)
     npt.assert_equal(actual, expected)
Exemple #40
0
 def test_design(self):
     """Compare the model's design matrix with the constant-augmented data."""
     model_design = self.model.exog
     data_design = add_constant(self.data.exog, prepend=True)
     npt.assert_equal(model_design, data_design)
def test_glm(constraints=None):
    """Compare RecursiveLS results against GLM on the same data.

    Both models regress ``dta.infl`` on a constant, ``unemp`` and ``m1``.
    The checks cover coefficients, standard errors, scale, degrees of
    freedom, residuals, log-likelihoods, forecasting, t/F tests and AIC.

    Parameters
    ----------
    constraints : optional
        Passed to ``RecursiveLS(..., constraints=...)`` and, when not None,
        to ``GLM.fit_constrained``. With constraints, forecasting with a
        custom design is expected to raise ``NotImplementedError``.
    """
    # More comprehensive tests against GLM estimates (this is sort of redundant
    # given `test_ols`, but this is mostly to complement the tests in
    # `test_glm_constrained`)
    endog = dta.infl
    exog = add_constant(dta[['unemp', 'm1']])

    mod = RecursiveLS(endog, exog, constraints=constraints)
    res = mod.fit()

    mod_glm = GLM(endog, exog)
    if constraints is None:
        res_glm = mod_glm.fit()
    else:
        res_glm = mod_glm.fit_constrained(constraints=constraints)

    # Regression coefficients, standard errors, and estimated scale
    assert_allclose(res.params, res_glm.params)
    assert_allclose(res.bse, res_glm.bse, atol=1e-6)
    # Note: scale here is computed according to Harvey, 1989, 4.2.5, and is
    # called the ML estimator and sometimes (e.g. later in section 5)
    # denoted \tilde \sigma_*^2
    assert_allclose(res.filter_results.obs_cov[0, 0], res_glm.scale)

    # DoF
    # Note: GLM does not include intercept in DoF, so modify by -1
    assert_equal(res.df_model - 1, res_glm.df_model)

    # OLS residuals are equivalent to smoothed forecast errors
    # (the latter are defined as e_t|T by Harvey, 1989, 5.4.5)
    # (this follows since the smoothed state simply contains the
    # full-information estimates of the regression coefficients)
    actual = (mod.endog[:, 0] -
              np.sum(mod['design', 0, :, :] * res.smoothed_state, axis=0))
    assert_allclose(actual, res_glm.resid_response, atol=1e-7)

    # Given the estimate of scale as `sum(v_t^2 / f_t) / (T - d)` (see
    # Harvey, 1989, 4.2.5 on p. 183), then llf_recursive is equivalent to the
    # full OLS loglikelihood (i.e. without the scale concentrated out).
    desired = mod_glm.loglike(res_glm.params, scale=res_glm.scale)
    assert_allclose(res.llf_recursive, desired)
    # Alternatively, we can construct the concentrated OLS loglikelihood
    # by computing the scale term with `nobs` in the denominator rather than
    # `nobs - d`.
    scale_alternative = np.sum((
        res.standardized_forecasts_error[0, 1:] *
        res.filter_results.obs_cov[0, 0]**0.5)**2) / mod.nobs
    llf_alternative = np.log(norm.pdf(res.resid_recursive, loc=0,
                                      scale=scale_alternative**0.5)).sum()
    assert_allclose(llf_alternative, res_glm.llf)

    # Prediction
    # TODO: prediction in this case is not working.
    if constraints is None:
        # Ten forecast steps with an all-ones design, compared with GLM
        # predictions on an all-ones exog of matching size
        design = np.ones((1, 3, 10))
        actual = res.forecast(10, design=design)
        assert_allclose(actual, res_glm.predict(np.ones((10, 3))))
    else:
        # With constraints, forecasting with a custom design must raise
        design = np.ones((2, 3, 10))
        assert_raises(NotImplementedError, res.forecast, 10, design=design)

    # Hypothesis tests
    actual = res.t_test('m1 = 0')
    desired = res_glm.t_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue, atol=1e-15)

    actual = res.f_test('m1 = 0')
    desired = res_glm.f_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue)

    # Information criteria
    # Note: the llf and llf_obs given in the results are based on the Kalman
    # filter and so the ic given in results will not be identical to the
    # OLS versions. Additionally, llf_recursive is comparable to the
    # non-concentrated llf, and not the concentrated llf that is by default
    # used in OLS. Compute new ic based on llf_alternative to compare.
    actual_aic = aic(llf_alternative, res.nobs_effective, res.df_model)
    assert_allclose(actual_aic, res_glm.aic)
Exemple #42
0
    def _EM_test(self, nuisance_params, params=None, param_nums=None,
                 b0_vals=None, F=None, survidx=None, uncens_nobs=None,
                numcensbelow=None, km=None, uncensored=None, censored=None,
                maxiter=None, ftol=None):
        """
        Uses EM algorithm to compute the maximum likelihood of a test

        Parameters
        ----------
        nuisance_params : array
            Vector of values to be used as nuisance params.
        params : array
            Full parameter vector. It is modified in place: the entries at
            ``param_nums`` are set to ``b0_vals`` and all remaining entries
            are set to ``nuisance_params``.
        param_nums : optional
            Indices of the parameters under test.
        b0_vals : optional
            Hypothesized values written to ``params[param_nums]``.
        F : array
            Starting weights; flattened at the start of every iteration and
            replaced by ``self.new_weights`` at its end.
        survidx : optional
            Index into the reversed cumulative sums of ``F``.
        uncens_nobs : int
            Number of rows selected from the cumulated weight matrix.
        numcensbelow : array
            Indexed by ``uncensored`` to pick, per row, the column of the
            cumulated weight matrix.
        km : array
            Weights that are flattened and normalized by their sum
            (presumably Kaplan-Meier weights -- confirm with the caller).
        uncensored, censored : optional
            Indices used to select entries of the weight and survival
            arrays.
        maxiter : int
            Maximum number of iterations in the EM algorithm for a
            parameter vector.
        ftol : float
            The loop stops once the absolute change of the optimized value
            between two iterations is no longer greater than ``ftol``.

        Returns
        -------
        -2 * log likelihood ratio at hypothesized values and
        nuisance params

        Notes
        -----
        Optional parameters are provided by the test_beta function.

        An ``IterationLimitWarning`` is issued whenever the iteration count
        equals ``maxiter`` on exit, including the case where the tolerance
        was reached on the very last permitted iteration.
        """
        iters = 0
        params[param_nums] = b0_vals

        # Every index that is not under test is a nuisance parameter
        nuis_param_index = np.int_(np.delete(np.arange(self.model.nvar),
                                           param_nums))
        params[nuis_param_index] = nuisance_params
        to_test = params.reshape(self.model.nvar, 1)
        opt_res = np.inf
        diff = np.inf
        while iters < maxiter and diff > ftol:
            # E step
            # See Zhou 2005, section 3.
            F = F.flatten()
            death = np.cumsum(F[::-1])
            survivalprob = death[::-1]
            surv_point_mat = np.dot(F.reshape(-1, 1),
                                1. / survivalprob[survidx].reshape(1, - 1))
            surv_point_mat = add_constant(surv_point_mat)
            summed_wts = np.cumsum(surv_point_mat, axis=1)
            wts = summed_wts[np.int_(np.arange(uncens_nobs)),
                             numcensbelow[uncensored]]
            self.model._fit_weights = wts
            # Uncensored weights' contribution to likelihood value.
            new_opt_res = self._opt_wtd_nuis_regress(to_test)
            # M step
            F = self.new_weights
            diff = np.abs(new_opt_res - opt_res)
            opt_res = new_opt_res
            iters = iters + 1
        # Log-likelihood at the hypothesized values from the final weights
        death = np.cumsum(F.flatten()[::-1])
        survivalprob = death[::-1]
        llike = -opt_res + np.sum(np.log(survivalprob[survidx]))
        # Reference log-likelihood computed from the normalized km weights
        wtd_km = km.flatten() / np.sum(km)
        survivalmax = np.cumsum(wtd_km[::-1])[::-1]
        llikemax = np.sum(np.log(wtd_km[uncensored])) + \
          np.sum(np.log(survivalmax[censored]))
        if iters == maxiter:
            warnings.warn('The EM reached the maximum number of iterations',
                          IterationLimitWarning)
        return -2 * (llike - llikemax)
 def setup_class(cls):
     """Fit OLS (with a constant) on the non-pandas stackloss data.

     The fit is stored as ``cls.res1``; ``cls.res2`` is a fresh
     ``RegressionResults`` instance (presumably the reference values --
     confirm).
     """
     dataset = stackloss.load(as_pandas=False)
     # Add the constant column to the regressors before fitting
     dataset.exog = add_constant(dataset.exog)
     fitted = OLS(dataset.endog, dataset.exog).fit()
     cls.res1 = fitted
     cls.res2 = RegressionResults()
def fitPoisson(X, Y):
    """Fit a Poisson-family GLM of Y on X plus a constant; return the fit."""
    design = add_constant(X)
    poisson_glm = sm.GLM(Y, design, family=sm.families.Poisson())
    return poisson_glm.fit(disp=0)
Exemple #45
0
### accuracy of probabilistic predictions in a set of mutually
### exclusive outcomes i.e. default, non-default

# Columns that are label-encoded before entering the model
model_vars = [
    'term',
    'home_ownership',
    'grade',
    'purpose',
    'emp_length',
]
# Columns copied into the design matrix without encoding
continous_vars = ['funded_amnt', 'dti']

le = preprocessing.LabelEncoder()
# Reset the index so y lines up with the freshly built X below
y = df_sample['default'].reset_index(drop=True)
X = pd.DataFrame([])
for var in model_vars:
    # The same encoder object is refitted for each column
    X[var] = le.fit_transform(df_sample[var])

for i in continous_vars:
    X[i] = df_sample[i].reset_index(drop=True)

# Add a constant column to the design matrix
X = smt.add_constant(X)
# Regression analysis: fit the logit model (disp=0 is passed to fit)
logit_model = sm.Logit(y, X)
result = logit_model.fit(disp=0)

# predict() is called without arguments, i.e. on the data used for the fit
y_true = df_sample['default']
y_pred = result.predict()

print(brier_score_loss(y_true, y_pred))
 def setup_class(cls):
     """Prepare the class-level fixtures for the stackloss regression.

     ``cls.res1`` is the OLS fit on the stackloss data with a constant
     added; ``cls.res2`` is a new ``RegressionResults`` instance
     (presumably the expected values -- confirm).
     """
     stack = stackloss.load(as_pandas=False)
     stack.exog = add_constant(stack.exog)
     ols_fit = OLS(stack.endog, stack.exog).fit()
     cls.res1 = ols_fit
     cls.res2 = RegressionResults()
Exemple #47
0
# VIF
# The vif of each column is ok. All of them are smaller than 5, even 2.
def variance_inflation_factor(exog, exog_idx):
    """Return the variance inflation factor of one column of ``exog``.

    The column at position ``exog_idx`` is regressed by OLS on all other
    columns and the result is ``1 / (1 - R^2)`` of that regression.

    Parameters
    ----------
    exog : object supporting ``.shape`` and ``.iloc``
        Design matrix (positional ``.iloc`` indexing is required).
    exog_idx : int
        Position of the column whose factor is computed.
    """
    n_columns = exog.shape[1]
    # Boolean selector that keeps every column except the target one
    keep_others = np.arange(n_columns) != exog_idx
    target = exog.iloc[:, exog_idx]
    others = exog.iloc[:, keep_others]
    auxiliary_fit = OLS(target, others).fit()
    return 1. / (1. - auxiliary_fit.rsquared)

# VIF of each column

# Build the constant-augmented design once instead of once per column;
# we skip the constant column, hence the range starts at 1.
X_with_const = add_constant(X)
VIF = [variance_inflation_factor(X_with_const, i)
       for i in range(1, X.shape[1] + 1)]


regr_1 = OLS(y, add_constant(X)).fit()

# residual distribution
sns.distplot(regr_1.resid) # acting like normal which is good

# since the residual itself is normal, box-cox is not necessary.
# namda = 0.1
# regr_test = OLS((y**namda-1)/namda, add_constant(X)).fit()
# sns.jointplot((y**namda-1)/namda, regr_test.resid)
# sns.distplot(regr_test.resid)

sns.jointplot(y, regr_1.resid) # which looks very strange. maybe the model is not linear at the first place.
# Since there is explicit non-linearity in this model, we have to add some non-linear covariates to it.