Example #1
def test_wrong_len_xname(reset_randomstate):
    y = np.random.randn(100)
    x = np.random.randn(100, 2)
    res = OLS(y, x).fit()
    with pytest.raises(ValueError):
        res.summary(xname=['x1'])
    with pytest.raises(ValueError):
        res.summary(xname=['x1', 'x2', 'x3'])
Example #2
def backwardElimination(x, sl):
    # Backward elimination: repeatedly refit OLS and drop the regressor whose
    # p-value is the largest, as long as it exceeds the significance level sl.
    # Note: the response vector y1 is expected to be defined in the enclosing scope.
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = OLS(y1, x).fit()
        maxVar = max(regressor_OLS.pvalues)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
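For context, here is a minimal usage sketch for the helper above. The synthetic data, the significance level, and the call to add_constant are illustrative assumptions, not part of the original snippet (which expects y1 to be defined at module level).

import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant

# Hypothetical data: 100 observations, 4 candidate regressors plus a constant.
rng = np.random.RandomState(0)
X = add_constant(rng.normal(size=(100, 4)))
y1 = X[:, 1] + 0.1 * rng.normal(size=100)  # module-level response read by the helper

X_reduced = backwardElimination(X, sl=0.05)
print(X_reduced.shape)  # only the columns that survived the p-value screen remain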
Example #3
    def test_OLSsummary_rsquared_label(self):
        # Check that the "uncentered" label is correctly added after rsquared
        x = [1, 5, 7, 3, 5, 2, 5, 3]
        y = [6, 4, 2, 7, 4, 9, 10, 2]
        reg_with_constant = OLS(y, x, hasconst=True).fit()
        assert 'R-squared:' in str(reg_with_constant.summary2())
        assert 'R-squared:' in str(reg_with_constant.summary())

        reg_without_constant = OLS(y, x, hasconst=False).fit()
        assert 'R-squared (uncentered):' in str(reg_without_constant.summary2())
        assert 'R-squared (uncentered):' in str(reg_without_constant.summary())
Example #4
    def test_OLSsummary_rsquared_label(self):
        # Check that the "uncentered" label is correctly added after rsquared
        x = [1, 5, 7, 3, 5, 2, 5, 3]
        y = [6, 4, 2, 7, 4, 9, 10, 2]
        reg_with_constant = OLS(y, x, hasconst=True).fit()
        assert 'R-squared:' in str(reg_with_constant.summary2())
        assert 'R-squared:' in str(reg_with_constant.summary())

        reg_without_constant = OLS(y, x, hasconst=False).fit()
        assert 'R-squared (uncentered):' in str(
            reg_without_constant.summary2())
        assert 'R-squared (uncentered):' in str(reg_without_constant.summary())
Example #5
    def test_regression_with_tuples(self):
        i = pandas.Series([1, 2, 3, 4] * 10, name="i")
        y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
        x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

        df = pandas.DataFrame(index=i.index)
        df = df.join(i)
        endo = df.join(y)
        exo = df.join(x)
        endo_groups = endo.groupby("i")
        exo_groups = exo.groupby("i")
        exo_df = exo_groups.agg([np.sum, np.max])
        endo_df = endo_groups.agg([np.sum, np.max])
        reg = OLS(exo_df[[("x", "sum")]], endo_df).fit()
        interesting_lines = []
        import warnings
        with warnings.catch_warnings():
            # Catch omni_normtest warning, not interesting here
            warnings.simplefilter("ignore")
            for line in str(reg.summary()).splitlines():
                if "_" in line:
                    interesting_lines.append(line[:38])

        desired = ["Dep. Variable:                  x_sum ",
                   "y_sum          1.4595      0.209      ",
                   "y_amax         0.2432      0.035      "]

        assert_equal(sorted(desired), sorted(interesting_lines))
Example #6
    def test_regression_with_tuples(self):
        i = pandas.Series([1, 2, 3, 4] * 10, name="i")
        y = pandas.Series([1, 2, 3, 4, 5] * 8, name="y")
        x = pandas.Series([1, 2, 3, 4, 5, 6, 7, 8] * 5, name="x")

        df = pandas.DataFrame(index=i.index)
        df = df.join(i)
        endo = df.join(y)
        exo = df.join(x)
        endo_groups = endo.groupby(("i", ))
        exo_groups = exo.groupby(("i", ))
        exo_Df = exo_groups.agg([np.sum, np.max])
        endo_Df = endo_groups.agg([np.sum, np.max])
        reg = OLS(exo_Df[[("x", "sum")]], endo_Df).fit()
        interesting_lines = []
        import warnings
        with warnings.catch_warnings():
            # Catch omni_normtest warning, not interesting here
            warnings.simplefilter("ignore")
            for line in str(reg.summary()).splitlines():
                if "('" in line:
                    interesting_lines.append(line[:38])

        desired = [
            "Dep. Variable:           ('x', 'sum') ",
            "('y', 'sum')      1.4595      0.209   ",
            "('y', 'amax')     0.2432      0.035   "
        ]

        self.assertEqual(sorted(desired), sorted(interesting_lines))
Example #7
def test_ols_summary_rsquared_label():
    # Check that the "uncentered" label is correctly added after rsquared
    x = [1, 5, 7, 3, 5, 2, 5, 3]
    y = [6, 4, 2, 7, 4, 9, 10, 2]
    reg_with_constant = OLS(y, add_constant(x)).fit()
    r2_str = 'R-squared:'
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_with_constant.summary2())
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_with_constant.summary())

    reg_without_constant = OLS(y, x, hasconst=False).fit()
    r2_str = 'R-squared (uncentered):'
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_without_constant.summary2())
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_without_constant.summary())
Example #8
def test_ols_summary_rsquared_label():
    # Check that the "uncentered" label is correctly added after rsquared
    x = [1, 5, 7, 3, 5, 2, 5, 3]
    y = [6, 4, 2, 7, 4, 9, 10, 2]
    reg_with_constant = OLS(y, add_constant(x)).fit()
    r2_str = 'R-squared:'
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_with_constant.summary2())
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_with_constant.summary())

    reg_without_constant = OLS(y, x, hasconst=False).fit()
    r2_str = 'R-squared (uncentered):'
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_without_constant.summary2())
    with pytest.warns(UserWarning):
        assert r2_str in str(reg_without_constant.summary())
Example #9
def test_fvalue_only_constant():
    # if only constant in model, return nan see #3642
    nobs = 20
    np.random.seed(2)
    x = np.ones(nobs)
    y = np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type='hac', cov_kwds={'maxlags': 3})
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()
Example #10
def test_fvalue_implicit_constant():
    nobs = 100
    np.random.seed(2)
    x = np.random.randn(nobs, 1)
    x = ((x > 0) == [True, False]).astype(int)
    y = x.sum(1) + np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()
Example #11
def test_fvalue_implicit_constant():
    nobs = 100
    np.random.seed(2)
    x = np.random.randn(nobs, 1)
    x = ((x > 0) == [True, False]).astype(int)
    y = x.sum(1) + np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()
Example #12
def test_fvalue_only_constant():
    # if only constant in model, return nan see #3642
    nobs = 20
    np.random.seed(2)
    x = np.ones(nobs)
    y = np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type='hac', cov_kwds={'maxlags': 3})
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type='HC1')
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()
Example #13
def test_summary_as_latex():
    # GH#734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    res = OLS(y, X).fit()
    with pytest.warns(UserWarning):
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\\{Date:\\}             &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\\{Time:\\}             &).+?&",
                   "     13:46:07     &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &     -177.029    &      207.153     \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &       -0.112    &        0.040     \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &       -3.125    &       -0.915     \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &       -1.518    &       -0.549     \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &       -0.563    &        0.460     \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &      798.788    &     2859.515     \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &     -5.5e+06    &    -1.47e+06     \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}

Warnings: \\newline
 [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
 [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
 strong multicollinearity or other numerical problems."""
    assert_equal(table, expected)
Example #14
def test_summary_as_latex():
    # GH#734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    res = OLS(y, X).fit()
    with pytest.warns(UserWarning):
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\\{Date:\\}             &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\\{Time:\\}             &).+?&",
                   "     13:46:07     &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &     -177.029    &      207.153     \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &       -0.112    &        0.040     \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &       -3.125    &       -0.915     \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &       -1.518    &       -0.549     \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &       -0.563    &        0.460     \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &      798.788    &     2859.515     \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &     -5.5e+06    &    -1.47e+06     \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}

Warnings: \\newline
 [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
 [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
 strong multicollinearity or other numerical problems."""
    assert_equal(table, expected)
Example #15
def create_linear_model(X_train, X_test, Y_train, Y_test):
    ''' TODO...
        - Predict the wine quality using the test set and compare the accuracy to the actual quality. Comment.
        - Print the parameter estimates and their 95% confidence intervals in a single table. (Suggest using
          confint()), and cbind()
    '''

    X_train = add_constant(X_train)
    regressionResult = OLS(Y_train, X_train).fit()
    print(regressionResult.summary())

    # Print various attributes of the OLS fitted model
    # print("R Squared: {}".format(regressionResult.rsquared))
    # print("SSE: {}".format(regressionResult.ess))
    # print("SSR: {}".format(regressionResult.ssr))
    # print("Residual MSE: {}".format(regressionResult.mse_resid))
    # print("Total MSE: {}".format(regressionResult.mse_total))
    # print("Model MSE: {}".format(regressionResult.mse_model))
    # print("F-Value: {}".format(regressionResult.mse_model/regressionResult.mse_resid))
    # print("NOBS: {}".format(regressionResult.nobs))
    # print("Centered TSS: {}".format(regressionResult.centered_tss))
    # print("Uncentered TSS: {}".format(regressionResult.uncentered_tss))
    # print("DF Model: {}".format(regressionResult.df_model))
    # print("DF Resid: {}".format(regressionResult.df_resid))
    # print("Standard Errors: {}".format(regressionResult.bse))
    print("Confidence: {}".format(regressionResult.conf_int()))

    predictions = regressionResult.predict(X_train)

    nobs, p = X_train.shape
    eaic = extractAIC(nobs, p, Y_train, predictions)
    print("Extract AIC: {}".format(eaic))

    params = regressionResult.params

    # n, p = X_test.shape
    # X_test = add_constant(X_test)
    # predictions = X_test.dot(params).reshape(n,1)

    # num_matches = 0
    # for i in range(len(Y_test)):
    #     p = int(round(predictions[i][0], 0))
    #     is_match = (Y_test[i] == p)

    #     if is_match:
    #         num_matches += 1

    #     print("Actual: {}, Predictions: {}... Match: {}".format(Y_test[i], p, is_match))

    # print("Number of matches: {}, Total number of Instances: {}".format(num_matches, n))
    # print("Percent correct guesses: {}%".format(round((num_matches/n)*100, 3)))

    return params
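The extractAIC() helper called above is not shown in this snippet. A plausible sketch, assuming it mirrors R's extractAIC() for linear models (AIC computed from the residual sum of squares), is given below; the signature matches the call site, but the formula is an assumption rather than the original implementation.

import numpy as np

def extractAIC(nobs, p, y_true, y_pred):
    # Hypothetical stand-in for the missing helper: AIC of a Gaussian linear
    # model computed from the residual sum of squares, as R's extractAIC() does.
    resid = np.asarray(y_true) - np.asarray(y_pred)
    rss = np.sum(resid ** 2)
    aic = nobs * np.log(rss / nobs) + 2 * p
    return aic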
Example #16
def test_summary():
    # test 734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    with warnings.catch_warnings(record=True):
        res = OLS(y, X).fit()
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\{Date:\}             &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\{Time:\}             &).+?&",
                   "     13:46:07     &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[95.0\\% Conf. Int.]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &      -177.029   207.153       \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &        -0.112     0.040       \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &        -3.125    -0.915       \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &        -1.518    -0.549       \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &        -0.563     0.460       \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &       798.788  2859.515       \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &      -5.5e+06 -1.47e+06       \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}"""
    assert_equal(table, expected)
Example #17
def test_summary():
    # test 734
    import re
    dta = longley.load_pandas()
    X = dta.exog
    X["constant"] = 1
    y = dta.endog
    with warnings.catch_warnings(record=True):
        res = OLS(y, X).fit()
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub("(?<=\n\\\\textbf\{Date:\}             &).+?&",
                   " Sun, 07 Apr 2013 &", table)
    table = re.sub("(?<=\n\\\\textbf\{Time:\}             &).+?&",
                   "     13:46:07     &", table)

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$>$$|$t$|$} & \\textbf{[95.0\\% Conf. Int.]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &      -177.029   207.153       \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &        -0.112     0.040       \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &        -3.125    -0.915       \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &        -1.518    -0.549       \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &        -0.563     0.460       \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &       798.788  2859.515       \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &      -5.5e+06 -1.47e+06       \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}"""
    assert_equal(table, expected)
Example #18
    def test_OLSsummary(self):
        # Test that latex output of regular OLS output still contains
        # multiple tables

        x = [1,5,7,3,5]
        x = add_constant(x)
        y1 = [6,4,2,7,4]
        reg1 = OLS(y1,x).fit()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            actual = reg1.summary().as_latex()
        string_to_find = r'''\end{tabular}
\begin{tabular}'''
        result = string_to_find in actual
        assert(result is True)
Example #19
    def test_OLSsummary(self):
        # Test that latex output of regular OLS output still contains
        # multiple tables

        x = [1, 5, 7, 3, 5]
        x = add_constant(x)
        y1 = [6, 4, 2, 7, 4]
        reg1 = OLS(y1, x).fit()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            actual = reg1.summary().as_latex()
        string_to_find = r'''\end{tabular}
\begin{tabular}'''
        result = string_to_find in actual
        assert (result is True)
Example #20
    def split(self, X, Y, seed=None, max_splits=1000):
        """
        splitting function

        Parameters
        -----------
        X: design matrix (not actually needed but taken for consistency)
        Y: outcome variable
        seed: random seed (default: None, uses system time)
        max_splits: int, maximum number of splits to try before failing
        """

        np.random.seed(seed)
        nsubs = len(Y)
        # cycle through until we find a split that is good enough
        runctr = 0
        best_pval = 0
        while True:
            runctr += 1
            cv = KFold(n_splits=self.nfolds, shuffle=True)

            idx = np.zeros((nsubs, self.nfolds))  # this is the design matrix
            folds = []
            ctr = 0
            # create design matrix for anova across folds
            for train, test in cv.split(Y):
                idx[test, ctr] = 1
                folds.append([train, test])
                ctr += 1

            # fit anova model, comparing means of Y across folds
            lm_y = OLS(Y - np.mean(Y), idx).fit()

            if lm_y.f_pvalue > best_pval:
                best_pval = lm_y.f_pvalue
                best_folds = folds

            if lm_y.f_pvalue > self.pthresh:
                if self.verbose:
                    print(lm_y.summary())
                return iter(folds)

            if runctr > max_splits:
                print('no sufficient split found, returning best (p=%f)' %
                      best_pval)  # noqa
                return iter(best_folds)
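A minimal usage sketch for the splitter above. The wrapping class name BalancedKFold and its constructor arguments are assumptions for illustration; the snippet only shows the split method, which expects self.nfolds, self.pthresh, and self.verbose to have been set elsewhere.

import numpy as np

rng = np.random.RandomState(0)
Y = rng.normal(size=60)           # outcome variable
X = rng.normal(size=(60, 3))      # unused by split(), passed for API consistency

splitter = BalancedKFold(nfolds=5, pthresh=0.8, verbose=False)  # hypothetical constructor
for train_idx, test_idx in splitter.split(X, Y):
    print(len(train_idx), len(test_idx))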
Example #21
    def test__repr_latex_(self):
        desired = r'''
\begin{center}
\begin{tabular}{lcccccc}
\toprule
               & \textbf{coef} & \textbf{std err} & \textbf{t} & \textbf{P$> |$t$|$} & \textbf{[0.025} & \textbf{0.975]}  \\
\midrule
\textbf{const} &       7.2248  &        0.866     &     8.346  &         0.000        &        5.406    &        9.044     \\
\textbf{x1}    &      -0.6609  &        0.177     &    -3.736  &         0.002        &       -1.033    &       -0.289     \\
\bottomrule
\end{tabular}
\end{center}
'''
        x = [1, 5, 7, 3, 5, 5, 8, 3, 3, 4, 6, 4, 2, 7, 4, 2, 1, 9, 2, 6]
        x = add_constant(x)
        y = [6, 4, 2, 7, 4, 2, 1, 9, 2, 6, 1, 5, 7, 3, 5, 5, 8, 3, 3, 4]
        reg = OLS(y, x).fit()

        actual = reg.summary().tables[1]._repr_latex_()
        actual = '\n%s\n' % actual
        assert_equal(actual, desired)
Example #22
    def split(self, X, Y, max_splits=1000):
        """
        - we don't actually need X but we take it for consistency
        """

        nsubs = len(Y)

        # cycle through until we find a split that is good enough

        runctr = 0
        best_pval = 0.
        while 1:
            runctr += 1
            cv = KFold(n_splits=self.nfolds, shuffle=True)

            idx = N.zeros((nsubs, self.nfolds))  # this is the design matrix
            folds = []
            ctr = 0
            for train, test in cv.split(Y):
                idx[test, ctr] = 1
                folds.append([train, test])
                ctr += 1

            lm_y = OLS(Y - N.mean(Y), idx).fit()

            if lm_y.f_pvalue > best_pval:
                best_pval = lm_y.f_pvalue
                best_folds = folds

            if lm_y.f_pvalue > self.pthresh:
                if self.verbose:
                    print(lm_y.summary())
                return iter(folds)

            if runctr > max_splits:
                print('no sufficient split found, returning best (p=%f)' %
                      best_pval)
                return iter(best_folds)
Example #23
def regression(aspects, dataset):
    asps = list(set(dataset.columns).intersection(aspects))
    asps.sort()
    
    aspsMP = list()
    for asp in asps:
        minus = asp+'_minus'
        dataset[minus] = dataset.apply(lambda x: 1 if x[asp] and x[asp+'sent'] == -1 else 0, axis=1)    
        aspsMP.append(minus)
         
        plus = asp+'_plus'
        dataset[plus] = dataset.apply(lambda x: 1 if x[asp] and x[asp+'sent'] == 1 else 0, axis=1)    
        aspsMP.append(plus)
         
#         overall = 'a_'+asp
#         dataset[overall] = dataset.apply(lambda x: x[asp]*x[asp+'sent'], axis=1)    
#         aspsMP.append(overall)

        neutral = asp+'_neutral'
        dataset[neutral] = dataset.apply(lambda x: 1 if x[asp] and x[asp+'sent'] == 0 else 0, axis=1)    
        aspsMP.append(neutral)
        
#         aspsMP.append(asp+'sent')
        
        
#     MINUS
#     PLUS
    
    aspsMP.sort()
    dataset['intercept'] = np.ones(len(dataset))
    aspsMP = ['intercept'] + aspsMP
#     print(len(aspects),len(asps))
    model = OLS(dataset['stars'], dataset[aspsMP]).fit()
#     model.summary
#     print(model.params)
#     print(model.pvalues)
    
    return model.summary()
Example #24
    def split(self, X, Y, max_splits=1000):
        """
        - we don't actually need X but we take it for consistency
        """

        nsubs = len(Y)

        # cycle through until we find a split that is good enough

        runctr = 0
        best_pval = 0.
        while 1:
            runctr += 1
            cv = KFold(n_splits=self.nfolds, shuffle=True)

            idx = N.zeros((nsubs, self.nfolds))  # this is the design matrix
            folds = []
            ctr = 0
            for train, test in cv.split(Y):
                idx[test, ctr] = 1
                folds.append([train, test])
                ctr += 1

            lm_y = OLS(Y - N.mean(Y), idx).fit()

            if lm_y.f_pvalue > best_pval:
                best_pval = lm_y.f_pvalue
                best_folds = folds

            if lm_y.f_pvalue > self.pthresh:
                if self.verbose:
                    print(lm_y.summary())
                return iter(folds)

            if runctr > max_splits:
                print('no sufficient split found, returning best (p=%f)' %
                      best_pval)
                return iter(best_folds)
Example #25
    def test_regression_with_tuples(self):
        i = pandas.Series( [1,2,3,4]*10 , name="i")
        y = pandas.Series( [1,2,3,4,5]*8, name="y")
        x = pandas.Series( [1,2,3,4,5,6,7,8]*5, name="x")

        df = pandas.DataFrame( index=i.index )
        df = df.join( i )
        endo = df.join( y )
        exo = df.join( x )
        endo_groups = endo.groupby( ("i",) )
        exo_groups = exo.groupby( ("i",) )
        exo_Df = exo_groups.agg( [np.sum, np.max] )
        endo_Df = endo_groups.agg( [np.sum, np.max] )
        reg = OLS(exo_Df[[("x", "sum")]],endo_Df).fit()
        interesting_lines = []
        for line in str( reg.summary() ).splitlines():
            if "('" in line:
                interesting_lines.append( line[:38] )
        
        desired = ["Dep. Variable:           ('x', 'sum') ",
                   "('y', 'sum')      1.4595      0.209   ",
                   "('y', 'amax')     0.2432      0.035   "]
        
        self.assertEqual( desired, interesting_lines  )
Example #26
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res

d2 = macrodata.load().data
g_gdp = 400 * np.diff(np.log(d2['realgdp']))
g_inv = 400 * np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)
res_olsg = OLS(g_inv, exogg).fit()

print(res_olsg.summary())
res_hc0 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc0.summary())
print('\n\n')
res_hac4 = res_olsg.get_robustcov_results('HAC',
                                          maxlags=4,
                                          use_correction=True)
print(res_hac4.summary())

print('\n\n')
tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False

print('\n\n')
"""
Run this after you've run benchmarks.py to get the /tmp/runtimes.csv data
"""
import pandas as pd
from statsmodels.regression.linear_model import OLS

df = pd.read_csv('/tmp/runtimes.csv')
df['constant'] = 1
df['resolution_sq'] = df['resolution']**2
quad_model = OLS(df['avg_time'],
                 df[['resolution_sq', 'resolution', 'constant']]).fit()
quad_model.summary()
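A small follow-on sketch (not in the original script) showing how the fitted quadratic runtime model might be used to predict the average run time at a new resolution; the resolution value is illustrative.

import pandas as pd

new_res = pd.DataFrame({'resolution_sq': [512 ** 2],
                        'resolution': [512],
                        'constant': [1]})
print(quad_model.predict(new_res))  # predicted avg_time at resolution 512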
Example #28
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res


d2 = macrodata.load().data
g_gdp = 400*np.diff(np.log(d2['realgdp']))
g_inv = 400*np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)
res_olsg = OLS(g_inv, exogg).fit()



print(res_olsg.summary())
res_hc0 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc0.summary())
print('\n\n')
res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4, use_correction=True)
print(res_hac4.summary())


print('\n\n')
tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False
Example #29
# (tail of a commented-out scaling block in the original script)
# y_train = standardscaler.fit_transform(y_train)
# y_test = standardscaler.transform(y_test)

#multiple linear algo
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor = regressor.fit(x_train, y_train)

y_pred = regressor.predict(x_test)

#backward elimination method
from statsmodels.regression.linear_model import OLS
x = np.append(arr=np.ones((50, 1)).astype(int), values=x, axis=1)
x_opt = x[:, :6]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()
regressor_OLS.summary()

x_opt = x[:, [0, 3]]
regressor_OLS = OLS(endog=y, exog=x_opt).fit()
regressor_OLS.summary()

#splitting dataset into train and test dataset
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_opt,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

#multiple linear algo
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
Example #30
    def fit_ols(self):

        self.data_lag.loc[self.data_lag.fecha <= "2020-04-04", "days"] = 30
        ts_ols = OLS(
            self.data_lag.iloc[:-1, ].fallecimientos,
            self.data_lag.iloc[:-1, ].drop(["fecha", "fallecimientos"],
                                           axis=1)).fit()
        sum = ts_ols.summary()
        predictions = pd.DataFrame(
            ts_ols.predict(self.forecast.drop("fecha", axis=1)))

        e = pd.DataFrame({
            "Modelo":
            "OLS",
            "Predicción de hoy": [predictions.iloc[0, 0]],
            "Error de hoy": [
                abs(predictions.iloc[0, 0] -
                    self.dt.loc[len(self.dt) - 1, "fallecimientos"])
            ]
        })

        predictions["fecha"] = self.dt.loc[len(self.dt) - 1, "fecha"]
        predictions.columns = ["fallecimientos", "fecha"]
        predictions.reset_index(drop=True, inplace=True)
        for i in range(len(self.forecast)):
            c = 0
            c += i
            predictions.loc[i,
                            "fecha"] = predictions.fecha[i] + timedelta(days=c)

        new = pd.concat(
            (self.dt[["fallecimientos", "fecha"]], predictions.iloc[1:, :]),
            axis=0)

        new["Predicciones"] = np.where(
            new.fecha <= self.dt.loc[len(self.dt) - 1, "fecha"], "Real",
            "Pred")

        fig = px.bar(
            new,
            x="fecha",
            y="fallecimientos",
            color="Predicciones",
        )

        # predictions.columns =["Predicciones_Fallecimientos", "fecha"]
        #
        # load = str(self.dt.loc[len(self.dt)-1, "fecha"] - timedelta(days=1))
        # load = load[0:10] + ".pkl"
        #
        # with open(load, "rb") as file:
        #     historic = pickle.load(file)
        # predictions["Error"] = 0
        # p=pd.concat([predictions.reset_index(drop=True), historic], ignore_index=True)
        # p = p.loc[p.fecha <= self.dt.loc[len(self.dt)-1, "fecha"],:]
        # p.reset_index(drop=True, inplace=True)
        # for i in range(0,len(p)):
        #     if self.dt.loc[len(self.dt)-1,"fecha"] == p.loc[i,"fecha"]:
        #         p.loc[i,"Error"] = np.sqrt((self.dt.loc[len(self.dt)-1,"fallecimientos"] - p.loc[i,"Predicciones_Fallecimientos"])**2)
        #
        # save = str(self.dt.loc[len(self.dt)-1, "fecha"])
        # save = save[0:10] + ".pkl"
        #
        # with open(save, "wb") as file:
        #     pickle.dump(p, file)

        return e, fig, sum
Example #31
import numpy as np
from statsmodels.regression.linear_model import OLS, GLSAR
from statsmodels.tools.tools import add_constant
from statsmodels.datasets import macrodata
import statsmodels.regression.tests.results.results_macro_ols_robust as res


d2 = macrodata.load(as_pandas=False).data
g_gdp = 400*np.diff(np.log(d2['realgdp']))
g_inv = 400*np.diff(np.log(d2['realinv']))
exogg = add_constant(np.c_[g_gdp, d2['realint'][:-1]], prepend=False)
res_olsg = OLS(g_inv, exogg).fit()



print(res_olsg.summary())
res_hc0 = res_olsg.get_robustcov_results('HC1')
print('\n\n')
print(res_hc0.summary())
print('\n\n')
res_hac4 = res_olsg.get_robustcov_results('HAC', maxlags=4, use_correction=True)
print(res_hac4.summary())


print('\n\n')
tt = res_hac4.t_test(np.eye(len(res_hac4.params)))
print(tt.summary())
print('\n\n')
print(tt.summary_frame())

res_hac4.use_t = False
Example #32
def ols_sm(X_train, y_train, X_test, y_test):
    X_train = sm.add_constant(
        X_train)  # adds col of ones for intercept coefficient in OLS model
    ols = OLS(y_train, X_train).fit()
    # with open('ols_model_summary.csv', 'w') as f:
    #     f.write(ols.summary().as_csv())
    with open('ols_model_summary.txt', 'w') as f:
        f.write(ols.summary().as_text())

    # Plot True vs Predicted values to examine if linear model is a good fit
    fig = plt.figure(figsize=(12, 8))
    X_test = sm.add_constant(X_test)
    plt.scatter(y_test, ols.predict(X_test))
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title('True vs Predicted values')
    plt.show()
    plt.close()
    # Add quadratic term to X or take log of y to improve

    # Discern if a linear relationship exists with partial regression plots
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_partregress_grid(ols, fig=fig)
    plt.title('Partial Regression Plots')
    plt.show()
    plt.close()

    # Identify outliers and high leverage points
    # a. Identify outliers (typically, those data points with studentized residuals outside of +/- 3 stdev).
    # Temporarily remove these from your data set and re-run your model.
    # Do your model metrics improve considerably? Does this give you cause for more confidence in your model?
    # b. Identify those outliers that are also high-leverage points (high residual and high leverage --> high influence).
    fig, ax = plt.subplots(figsize=(12, 8))
    fig = sm.graphics.influence_plot(ols, ax=ax, criterion="cooks")
    plt.show()
    fig, ax = plt.subplots(figsize=(8, 6))
    fig = sm.graphics.plot_leverage_resid2(ols, ax=ax)
    plt.show()
    plt.close()

    # Confirm homoscedasticity (i.e., constant variance of residual terms)
    # If residuals exhibit a “funnel shaped” effect, consider transforming your data into logarithmic space.
    studentized_residuals = ols.outlier_test()[:, 0]
    y_pred = ols.fittedvalues
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_pred, studentized_residuals)
    ax.axhline(y=0.0, color='k', ls='--')
    ax.set_xlabel('Predicted y')
    ax.set_ylabel('Studentized Residuals')
    plt.show()
    plt.close()

    # Test if residuals are normally distributed in QQ plot
    # plots quantile of the normal distribution against studentized residuals
    # if sample quantiles are normally distributed, the dots will align with 45 deg line
    fig, ax = plt.subplots()
    sm.graphics.qqplot(studentized_residuals, fit=True, line='45', ax=ax)
    plt.show()
    plt.close()

    # Find influential points in the data
    # DFBETAS - standardized measure of how much each coefficient changes when that observation is left out
    threshold = 2. / len(X_train)**.5
    infl = ols.get_influence()
    df = pd.DataFrame(infl.summary_frame().filter(regex="dfb"))
    inf = df[df > threshold].dropna(axis=0, how='all')
    print('Influential points:\n', inf)
Example #33
"""Try a Multinomial LogisticRegression"""
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.regression.linear_model import OLS
# we use the full df for the regression because we want to weight results by the
# existence of different ads in different neighborhoods, not just unique addresses
X = df[["black_proportion","log_income","asian_proportion","latinx_proportion","log_price"]]
y = df.white_proportion
df_tmp = df.copy()
df_tmp[list(range(30))] = df_tmp[list(range(30))].where(df_tmp[list(range(30))]>.1,0)
# formula terms considered: topic_0 + topic_7 + topic_8 + topic_9 + topic_12 + topic_14 + topic_16 + topic_17 + topic_20 + topic_23 + topic_24 + topic_25 + topic_28
X = df[[str(x) for x in [0,7,8,9,12,14,16,17,20,23,24,25,28]]+["black_proportion","log_income","log_price","total_RE"]]
y = np.where(df['white_proportion']>np.median(df['white_proportion']),1,0)
y= df['income']
OLR = OLS(y,X).fit()
OLR.summary()
OLR.predict(exog=X)

df_full_results.params.sort_values()
df_results.params.sort_values()
df_results.summary()
EN = ElasticNet(alpha = .02, l1_ratio=.001)
EN.fit(X,y)
EN.score(X,y)
EN.predict(X)
LinR = LinearRegression()
LinR.fit(X,y)
LinR.score(X,y)

RR = Ridge()
RR.fit(X,y).score(X,y)
Example #34
# (snippet starts mid-function: this is the tail of the encode_categorical_variables helper)
                          columns=encoder.get_feature_names(cat_names))
    X = X.select_dtypes(exclude=['O', 'category']).join(df_cat)
    return X


# Data prep

data_file = '.../cohort_analysis.csv'
df = feature_extraction(data_file).drop('Month', axis=1)
X_train, X_test, y_train, y_test = get_the_set(df=df, target_variable='y')

encoder = fit_encoder(X_train)

X_train = run_scaling(X_train)
X_train = encode_categorical_variables(X_train, encoder)

X_test = run_scaling(X_test)
X_test = encode_categorical_variables(X_test, encoder)

# Model
model = OLS(
    y_train,
    add_constant(
        X_train.drop(['Week_1', 'Week_3', 'Cohort_active_users'],
                     axis=1))).fit()
model.summary()

# Test the assumption of Linear Regression
tester = Assumptions.Assumption_Tester_OLS(X_train, y_train)
tester.run_all()
Example #35
def summary_OLS(X, y):
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    ols = OLS(y, X).fit()
    print(ols.summary())
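A quick, hypothetical call to the helper above; the random data and shapes are illustrative only.

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = X @ [1.0, -2.0, 0.5] + rng.normal(size=200)
summary_OLS(X, y)  # prints the OLS summary for the standardized regressors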
Example #36
df1 = df_dat.groupby(['gr_span'])
df1['testscr'].describe()
df1.describe()
df1.mean()

df1 = df.groupby(['gr_span', 'county'])
df1.describe()

# 01 - Fit a linear regression model
reg = linear_model.LinearRegression()
x = pd.DataFrame(df_dat.iloc[:, [2, 3, 4, 5, 6, 8, 9, 10, 11, 12]])
x['cons'] = 1  # add a constant (intercept) term
y = df_dat['testscr']
regres = OLS(y, x, missing='drop').fit()
regres.summary()

# 02 - Principal component analysis
x1 = pd.DataFrame(df_dat.iloc[:, [2, 3, 4, 5, 6, 8, 9, 10, 11, 12]])
x_scaled = preprocessing.scale(x1)
pca = PCA(n_components=3)  # extract principal components from the correlation matrix
pca.fit(x_scaled)
pca_components = pd.DataFrame(pca.components_)  # principal component coefficient (loading) matrix
pca_components.to_csv('C:/Users/ASUS/Desktop/统计计算实验课/jietu/成分系数矩阵.csv')
pca.explained_variance_  # eigenvalues
pca.explained_variance_ratio_  # explained variance ratios
zdf = pd.DataFrame(x_scaled)
zi = pd.DataFrame(pca.transform(x_scaled), columns=['z1', 'z2', 'z3'])
Zdf = pd.concat([zdf, zi], axis=1)
Zdf_describe = pd.DataFrame(Zdf.describe())
Zdf_describe.to_csv('C:/Users/ASUS/Desktop/统计计算实验课/jietu/成分矩阵描述.csv')
Example #37
import pandas as pd
from statsmodels.regression.linear_model import OLS
import numpy as np
np.set_printoptions(suppress=True)

data = pd.read_csv('Dataset/dataset.csv')

X = data["Head Size(cm^3)"].values
y = data["Brain Weight(grams)"].values

X = np.array(X, dtype='float64')
y = np.array(y, dtype='float64')
y = np.reshape(y, (len(y), 1))

X = np.column_stack([np.ones(len(X)), X])

# Implement the statsmodel function

res = OLS(y, X).fit()

# Theta values
theta = res.params

print(theta)

# prediction
ols_pred = res.predict()

print(res.summary())
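For comparison, a short sketch showing that the same coefficients can be recovered from the normal equations; it reuses the X and y arrays built above and is purely illustrative.

# Closed-form least squares: theta = (X'X)^{-1} X'y
theta_manual = np.linalg.solve(X.T @ X, X.T @ y)
print(theta_manual.ravel())  # should match res.params up to numerical precision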
Example #38
# (snippet starts mid-call: the preceding, truncated line splits data_x / data_y
#  into train and test sets, e.g. via train_test_split)
    data_x, data_y, test_size=0.2, random_state=0)

# Model Creation and Fitting
multi_linear_model = LinearRegression()
multi_linear_model.fit(trainset_data_x, trainset_data_y)

# Predictions
prediction_y = multi_linear_model.predict(testset_data_x)

# Introduction to Backward Elimination
data_x = np.append(values=data_x, arr=np.ones((50, 1)).astype(int), axis=1)
# First Case
optimal_x = np.array(data_x[:, [0, 1, 2, 3, 4, 5]], dtype=float)

ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()

# Second Case
optimal_x = np.array(data_x[:, [0, 1, 3, 4, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()

# Third Case
optimal_x = np.array(data_x[:, [0, 3, 4, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()

# Fourth Case
optimal_x = np.array(data_x[:, [0, 3, 5]], dtype=float)
ols_model = OLS(endog=data_y, exog=optimal_x).fit()
ols_model.summary()
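At each elimination step above, the column to drop is chosen by looking at the fitted p-values; a short illustrative check (not part of the original snippet):

import numpy as np

pvals = ols_model.pvalues
print(pvals)
print("candidate column to drop next:", int(np.argmax(pvals)))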
Example #39
# since there is explicit non-linearity in this model, we have to add some non-linear covariates to it.

# partial residual plots
# These attempt to show how each covariate is related to the dependent variable
# once we control for the effects of all other covariates.
# The partial residual plots look acceptable.
sns.jointplot(regr_1.params.bmi * X.bmi + regr_1.resid, X.bmi)
sns.jointplot(regr_1.params.age * X.age + regr_1.resid, X.age)


#######################
####model selection####
#######################

#original model
regr_1.summary()

# the first issue to address is the dependence structure in the residuals.
# try no. 1: add an interaction covariate smoker:bmi
X_2 = X.iloc[:, :]
X_2['sm_bm'] = X_2.smoker * X_2.bmi
regr_test = OLS(y, add_constant(X_2)).fit()
regr_test.summary()
# which clearly improves the performance of the model

# try no. 2: add an interaction covariate smoker:age
X_2['sm_ag'] = X_2.smoker * X_2.age
regr_test = OLS(y, add_constant(X_2)).fit()
regr_test.summary()
# which improves the performance of the model significantly
Example #40
y = df.iloc[:, 4].values

# data encoding
label_encode = LabelEncoder()
x[:, 0] = label_encode.fit_transform(x[:, 0])
# note: categorical_features is an older scikit-learn API, removed in later releases
one_hot_encode = OneHotEncoder(categorical_features=[0])
x = one_hot_encode.fit_transform(x).toarray()
x[:, 0] = numpy.ones(x.shape[0])

# standardising the cols
standard_scalar = StandardScaler()
x[:, 2:] = standard_scalar.fit_transform(x[:, 2:])

# to know which cols to remove
ols = OLS(endog=y, exog=x).fit()
print(ols.summary())

# dropping col that are having high p-values and the constant col
x = numpy.delete(x, [0, 1], axis=1)

# data splitting
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=0,
                                                    test_size=0.25)

# Decision tree classifier
classifier = DecisionTreeClassifier(criterion='entropy',
                                    max_depth=5,
                                    random_state=42)
classifier.fit(x_train, y_train)
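A short follow-up, not part of the original snippet, showing how the fitted tree might be evaluated on the held-out split:

print(classifier.score(x_test, y_test))  # mean accuracy on the test set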