Example #1
import numpy as np
from numpy.testing import assert_allclose, assert_almost_equal
from statsmodels.regression.linear_model import OLS


def test_influence_dtype():
    # see #2148  bug when endog is integer
    y = np.ones(20, dtype=int)  # integer endog used to break get_influence
    np.random.seed(123)
    x = np.random.randn(20, 3)
    res1 = OLS(y, x).fit()

    res2 = OLS(y*1., x).fit()
    cr1 = res1.get_influence().cov_ratio
    cr2 = res2.get_influence().cov_ratio
    assert_allclose(cr1, cr2, rtol=1e-14)
    # regression test for values
    cr3 = np.array(
      [ 1.22239215,  1.31551021,  1.52671069,  1.05003921,  0.89099323,
        1.57405066,  1.03230092,  0.95844196,  1.15531836,  1.21963623,
        0.87699564,  1.16707748,  1.10481391,  0.98839447,  1.08999334,
        1.35680102,  1.46227715,  1.45966708,  1.13659521,  1.22799038])
    assert_almost_equal(cr1, cr3, decimal=8)
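For reference, cov_ratio is statsmodels' COVRATIO influence measure. A brute-force sketch that should reproduce it under the textbook leave-one-out definition (statsmodels computes it in closed form, so treat the exact agreement as an assumption):

import numpy as np
from statsmodels.regression.linear_model import OLS

np.random.seed(123)
x = np.random.randn(20, 3)
y = x.sum(1) + np.random.randn(20)
res = OLS(y, x).fit()

# COVRATIO_i: determinant of the coefficient covariance with observation i
# deleted, divided by the determinant from the full sample.
det_full = np.linalg.det(res.cov_params())
cov_ratio_manual = np.empty(len(y))
for i in range(len(y)):
    keep = np.arange(len(y)) != i
    cov_loo = OLS(y[keep], x[keep]).fit().cov_params()
    cov_ratio_manual[i] = np.linalg.det(cov_loo) / det_full

print(np.allclose(res.get_influence().cov_ratio, cov_ratio_manual))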
Example #2
import numpy as np
import statsmodels.stats.outliers_influence as oi
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant


def test_outlier_influence_funcs():
    # smoke test: exercise the summary functions without checking values
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    oi.summary_table(res, alpha=0.05)

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()
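The OLSInfluence object returned by get_influence() also exposes each diagnostic directly; a minimal sketch of pulling them out (attribute names follow the statsmodels API):

import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

x = add_constant(np.random.randn(10, 2))
y = x.sum(1) + np.random.randn(10)
res = OLS(y, x).fit()
infl = res.get_influence()

# summary_frame collects per-observation diagnostics in one DataFrame;
# individual measures such as Cook's distance are also attributes.
frame = infl.summary_frame()
cooks_d, pvals = infl.cooks_distance
print(frame.columns.tolist())
print(cooks_d.max())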
Example #3
import numpy as np
import statsmodels.stats.outliers_influence as oi
from numpy.testing import assert_
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant


def test_outlier_influence_funcs(reset_randomstate):
    # reset_randomstate is a pytest fixture from statsmodels' test suite
    x = add_constant(np.random.randn(10, 2))
    y = x.sum(1) + np.random.randn(10)
    res = OLS(y, x).fit()
    out_05 = oi.summary_table(res)
    # GH3344 : Check alpha has an effect
    out_01 = oi.summary_table(res, alpha=0.01)
    assert_(np.all(out_01[1][:, 6] <= out_05[1][:, 6]))
    assert_(np.all(out_01[1][:, 7] >= out_05[1][:, 7]))

    res2 = OLS(y, x[:, 0]).fit()
    oi.summary_table(res2, alpha=0.05)
    infl = res2.get_influence()
    infl.summary_table()
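The column indices 6 and 7 in the assertions are the prediction-interval bounds in summary_table's data array; a quick way to check what those columns hold (the exact labels may vary across statsmodels versions):

import numpy as np
import statsmodels.stats.outliers_influence as oi
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

x = add_constant(np.random.randn(10, 2))
y = x.sum(1) + np.random.randn(10)
res = OLS(y, x).fit()

# summary_table returns (SimpleTable, data ndarray, column names);
# a smaller alpha can only push the interval bounds outward.
_, data, names = oi.summary_table(res, alpha=0.05)
print(names[6], names[7])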
Example #4
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS


def ols_sm(X_train, y_train, X_test, y_test):
    # add a column of ones for the intercept coefficient in the OLS model
    X_train = sm.add_constant(X_train)
    ols = OLS(y_train, X_train).fit()
    # with open('ols_model_summary.csv', 'w') as f:
    #     f.write(ols.summary().as_csv())
    with open('ols_model_summary.txt', 'w') as f:
        f.write(ols.summary().as_text())

    # Plot True vs Predicted values to examine if linear model is a good fit
    fig = plt.figure(figsize=(12, 8))
    X_test = sm.add_constant(X_test)
    plt.scatter(y_test, ols.predict(X_test))
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title('True vs Predicted values')
    plt.show()
    plt.close()
    # If the fit looks poor, consider adding a quadratic term to X or taking the log of y

    # Discern if a linear relationship exists with partial regression plots
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_partregress_grid(ols, fig=fig)
    plt.title('Partial Regression Plots')
    plt.show()
    plt.close()

    # Identify outliers and high leverage points
    # a. Identify outliers (typically, those data points with studentized residuals outside of +/- 3 stdev).
    # Temporarily remove these from your data set and re-run your model.
    # Do your model metrics improve considerably? Does this give you cause for more confidence in your model?
    # b. Identify those outliers that are also high-leverage points (high residual and high leverage --> high influence).
    fig, ax = plt.subplots(figsize=(12, 8))
    fig = sm.graphics.influence_plot(ols, ax=ax, criterion="cooks")
    plt.show()
    fig, ax = plt.subplots(figsize=(8, 6))
    fig = sm.graphics.plot_leverage_resid2(ols, ax=ax)
    plt.show()
    plt.close()

    # Confirm homoscedasticity (i.e., constant variance of residual terms)
    # If residuals exhibit a “funnel shaped” effect, consider transforming your data into logarithmic space.
    # outlier_test returns an ndarray (or a DataFrame when row labels are
    # available); column 0 is the studentized residual
    studentized_residuals = np.asarray(ols.outlier_test())[:, 0]
    y_pred = ols.fittedvalues
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_pred, studentized_residuals)
    ax.axhline(y=0.0, color='k', ls='--')
    ax.set_xlabel('Predicted y')
    ax.set_ylabel('Studentized Residuals')
    plt.show()
    plt.close()

    # Test if residuals are normally distributed in QQ plot
    # plots quantile of the normal distribution against studentized residuals
    # if sample quantiles are normally distributed, the dots will align with 45 deg line
    fig, ax = plt.subplots()
    sm.graphics.qqplot(studentized_residuals, fit=True, line='45', ax=ax)
    plt.show()
    plt.close()

    # Find influential points in the data
    # DFBETAS - standardized measure of how much each coefficient changes
    # when that observation is left out; a common cutoff is 2/sqrt(n)
    threshold = 2. / len(X_train) ** .5
    infl = ols.get_influence()
    df = infl.summary_frame().filter(regex="dfb")
    inf = df[df.abs() > threshold].dropna(axis=0, how='all')
    print('Influential points:\n', inf)
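A hypothetical driver for ols_sm on synthetic data (the split sizes and coefficients are made up for illustration):

import numpy as np

np.random.seed(0)
X = np.random.randn(100, 3)
y = X @ np.array([1.5, -2.0, 0.5]) + np.random.randn(100)

# 80/20 train/test split; ols_sm writes the summary to
# ols_model_summary.txt and shows the diagnostic plots.
ols_sm(X[:80], y[:80], X[80:], y[80:])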