def test_formula(self):
        """Smoke-test ``predict_functional`` on a formula-specified OLS model.

        Simulates 500 observations, fits an OLS model whose formula combines
        ``bs`` spline terms, interactions and a categorical factor, then
        plots the prediction over the focus variable ``x2`` with ``x4`` fixed
        at "B" and at "C": once as plain lines and once with the returned
        bands shaded.  No numerical result is asserted; each figure is
        handed to ``self.close_or_save``.
        """

        np.random.seed(542)
        n = 500
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        # x4 is a categorical covariate with labels drawn from "A".."E".
        x4 = np.random.randint(0, 5, size=n)
        x4 = np.asarray(["ABCDE"[i] for i in x4])
        x5 = np.random.normal(size=n)
        y = (0.3 * x2**2 + (x4 == "B") + 0.1 * (x4 == "B") * x2**2 + x5
             + np.random.normal(size=n))

        df = pd.DataFrame({
            "y": y,
            "x1": x1,
            "x2": x2,
            "x3": x3,
            "x4": x4,
            "x5": x5
        })

        fml = "y ~ x1 + bs(x2, df=4) + x3 + x2*x3 + I(x1**2) + C(x4) + C(x4)*bs(x2, df=4) + x5"
        model = sm.OLS.from_formula(fml, data=df)
        result = model.fit()

        # Summary functions for the covariates that are neither the focus
        # variable nor given an explicit value below.
        summaries = {"x1": np.mean, "x3": pctl(0.75), "x5": np.mean}

        values = {"x4": "B"}
        pr1, ci1, fvals1 = predict_functional(result, "x2", summaries, values)

        values = {"x4": "C"}
        pr2, ci2, fvals2 = predict_functional(result, "x2", summaries, values)

        # Figure 1: point predictions only.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x4=B')
        plt.plot(fvals2, pr2, '-', label='x4=C')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        plt.figlegend(ha, lb, loc="center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

        # Figure 2: same predictions with the returned bands shaded.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x4=B')
        plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
        plt.plot(fvals2, pr2, '-', label='x4=C')
        plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, loc="center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)
    def test_formula(self):
        """Smoke-test ``predict_functional`` on a formula-specified OLS model.

        Simulates 500 observations, fits an OLS model whose formula combines
        ``bs`` spline terms, interactions and a categorical factor, then
        plots the prediction over the focus variable ``x2`` with ``x4`` fixed
        at "B" and at "C": once as plain lines and once with the returned
        bands shaded.  No numerical result is asserted; each figure is
        handed to ``self.close_or_save``.
        """

        np.random.seed(542)
        n = 500
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        # x4 is a categorical covariate with labels drawn from "A".."E".
        x4 = np.random.randint(0, 5, size=n)
        x4 = np.asarray(["ABCDE"[i] for i in x4])
        x5 = np.random.normal(size=n)
        y = 0.3*x2**2 + (x4 == "B") + 0.1*(x4 == "B")*x2**2 + x5 + np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3, "x4": x4, "x5": x5})

        fml = "y ~ x1 + bs(x2, df=4) + x3 + x2*x3 + I(x1**2) + C(x4) + C(x4)*bs(x2, df=4) + x5"
        model = sm.OLS.from_formula(fml, data=df)
        result = model.fit()

        # Summary functions for the covariates that are neither the focus
        # variable nor given an explicit value below.
        summaries = {"x1": np.mean, "x3": pctl(0.75), "x5": np.mean}

        values = {"x4": "B"}
        pr1, ci1, fvals1 = predict_functional(result, "x2", summaries, values)

        values = {"x4": "C"}
        pr2, ci2, fvals2 = predict_functional(result, "x2", summaries, values)

        # Figure 1: point predictions only.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x4=B')
        plt.plot(fvals2, pr2, '-', label='x4=C')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        plt.figlegend(ha, lb, loc="center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

        # Figure 2: same predictions with the returned bands shaded.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x4=B')
        plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
        plt.plot(fvals2, pr2, '-', label='x4=C')
        plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, loc="center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)
    def test_lm_contrast(self):
        """Plot an OLS contrast from ``predict_functional`` against the truth.

        Fits an OLS model with ``x1*x2`` and ``x2*x3`` interactions to
        simulated data and requests the contrast between two covariate
        settings (``values`` versus ``values2``) over the focus variable
        ``x1`` with ``ci_method='scheffe'``.  The estimate, its band and the
        known true contrast ``4 - x1`` are drawn on one figure, which is
        handed to ``self.close_or_save``.  Nothing numeric is asserted.
        """

        np.random.seed(542)
        n = 200
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        y = x1 + 2*x2 + x3 - x1*x2 + x2*x3 + np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + x2 + x3 + x1*x2 + x2*x3"
        model = sm.OLS.from_formula(fml, data=df)
        result = model.fit()

        # Expected responses under the generating model at each setting;
        # their difference, 4 - x1, is the "Truth" line plotted below.
        values = {"x2": 1, "x3": 1} # y = 4
        values2 = {"x2": 0, "x3": 0} # y = x1
        pr, cb, fvals = predict_functional(result, "x1", values=values,
                                           values2=values2, ci_method='scheffe')

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.67, 0.8])
        plt.plot(fvals, pr, '-', label="Estimate", color='orange', lw=4)
        plt.plot(fvals, 4 - fvals, '-', label="Truth", color='lime', lw=4)
        plt.fill_between(fvals, cb[:, 0], cb[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        leg = plt.figlegend(ha, lb, loc="center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Mean contrast", size=15)
        plt.title("Linear model contrast")
        self.close_or_save(fig)
    def test_glm_formula_contrast(self):
        """Plot a Poisson GLM contrast from ``predict_functional``.

        Fits a Poisson GLM with an ``x1*x2`` interaction to 50 simulated
        counts and requests the contrast between two covariate settings
        (``values`` versus ``values2``) over the focus variable ``x1`` with
        ``ci_method='simultaneous'``.  The estimate, its band and the true
        linear-predictor contrast ``0.2 - 0.1*x1`` are drawn on one figure,
        which is handed to ``self.close_or_save``.  Nothing numeric is
        asserted.
        """

        np.random.seed(542)
        n = 50
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        # `mn` is the log of the Poisson mean (it is exponentiated below).
        mn = 5 + 0.1*x1 + 0.1*x2 + 0.1*x3 - 0.1*x1*x2
        y = np.random.poisson(np.exp(mn), size=len(mn))

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + x2 + x3 + x1*x2"
        model = sm.GLM.from_formula(fml, data=df, family=sm.families.Poisson())
        result = model.fit()

        # Value of `mn` at each setting; their difference, 0.2 - 0.1*x1,
        # is the "Truth" line plotted below.
        values = {"x2": 1, "x3": 1} # y = 5.2
        values2 = {"x2": 0, "x3": 0} # y = 5 + 0.1*x1
        pr, cb, fvals = predict_functional(result, "x1", values=values,
                                           values2=values2, ci_method='simultaneous')

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.67, 0.8])
        plt.plot(fvals, pr, '-', label="Estimate", color='orange', lw=4)
        plt.plot(fvals, 0.2 - 0.1*fvals, '-', label="Truth", color='lime', lw=4)
        plt.fill_between(fvals, cb[:, 0], cb[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        leg = plt.figlegend(ha, lb, loc="center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Linear predictor contrast", size=15)
        plt.title("Poisson regression contrast")
        self.close_or_save(fig)
    def test_glm_formula_contrast(self):
        """Plot a Poisson GLM contrast from ``predict_functional``.

        Fits a Poisson GLM with an ``x1*x2`` interaction to 50 simulated
        counts and requests the contrast between two covariate settings
        (``values`` versus ``values2``) over the focus variable ``x1`` with
        ``ci_method='simultaneous'``.  The estimate, its band and the true
        linear-predictor contrast ``0.2 - 0.1*x1`` are drawn on one figure,
        which is handed to ``self.close_or_save``.  Nothing numeric is
        asserted.
        """

        np.random.seed(542)
        n = 50
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        # `mn` is the log of the Poisson mean (it is exponentiated below).
        mn = 5 + 0.1*x1 + 0.1*x2 + 0.1*x3 - 0.1*x1*x2
        y = np.random.poisson(np.exp(mn), size=len(mn))

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + x2 + x3 + x1*x2"
        model = sm.GLM.from_formula(fml, data=df, family=sm.families.Poisson())
        result = model.fit()

        # Value of `mn` at each setting; their difference, 0.2 - 0.1*x1,
        # is the "Truth" line plotted below.
        values = {"x2": 1, "x3": 1} # y = 5.2
        values2 = {"x2": 0, "x3": 0} # y = 5 + 0.1*x1
        pr, cb, fvals = predict_functional(result, "x1", values=values,
                                           values2=values2, ci_method='simultaneous')

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.67, 0.8])
        plt.plot(fvals, pr, '-', label="Estimate", color='orange', lw=4)
        plt.plot(fvals, 0.2 - 0.1*fvals, '-', label="Truth", color='lime', lw=4)
        plt.fill_between(fvals, cb[:, 0], cb[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        leg = plt.figlegend(ha, lb, loc="center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Linear predictor contrast", size=15)
        plt.title("Poisson regression contrast")
        self.close_or_save(fig)
    def test_lm_contrast(self):
        """Plot an OLS contrast from ``predict_functional`` against the truth.

        Fits an OLS model with ``x1*x2`` and ``x2*x3`` interactions to
        simulated data and requests the contrast between two covariate
        settings (``values`` versus ``values2``) over the focus variable
        ``x1`` with ``ci_method='scheffe'``.  The estimate, its band and the
        known true contrast ``4 - x1`` are drawn on one figure, which is
        handed to ``self.close_or_save``.  Nothing numeric is asserted.
        """

        np.random.seed(542)
        n = 200
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        y = x1 + 2*x2 + x3 - x1*x2 + x2*x3 + np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + x2 + x3 + x1*x2 + x2*x3"
        model = sm.OLS.from_formula(fml, data=df)
        result = model.fit()

        # Expected responses under the generating model at each setting;
        # their difference, 4 - x1, is the "Truth" line plotted below.
        values = {"x2": 1, "x3": 1} # y = 4
        values2 = {"x2": 0, "x3": 0} # y = x1
        pr, cb, fvals = predict_functional(result, "x1", values=values,
                                           values2=values2, ci_method='scheffe')

        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.67, 0.8])
        plt.plot(fvals, pr, '-', label="Estimate", color='orange', lw=4)
        plt.plot(fvals, 4 - fvals, '-', label="Truth", color='lime', lw=4)
        plt.fill_between(fvals, cb[:, 0], cb[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        leg = plt.figlegend(ha, lb, loc="center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Mean contrast", size=15)
        plt.title("Linear model contrast")
        self.close_or_save(fig)
    def test_noformula_prediction(self):
        """Smoke-test ``predict_functional`` on an OLS model built from arrays.

        Fits OLS to a plain NumPy design matrix (no formula), then plots the
        prediction over the focus variable ``x1`` with ``x2`` fixed at 1 and
        at -1: once as plain lines and once with the returned bands
        shaded.  No numerical result is asserted; each figure is handed to
        ``self.close_or_save``.
        """

        np.random.seed(6434)
        n = 200
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        y = x1 - x2 + np.random.normal(size=n)

        # Unnamed columns; the names "x1".."x3" used below presumably match
        # statsmodels' default exog names -- confirm if the design changes.
        exog = np.vstack((x1, x2, x3)).T

        model = sm.OLS(y, exog)
        result = model.fit()

        summaries = {"x3": pctl(0.75)}
        values = {"x2": 1}
        pr1, ci1, fvals1 = predict_functional(result, "x1", summaries, values)

        values = {"x2": -1}
        pr2, ci2, fvals2 = predict_functional(result, "x1", summaries, values)

        # Figure 1: point predictions only.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x2=1', lw=4, alpha=0.6, color='orange')
        plt.plot(fvals2, pr2, '-', label='x2=-1', lw=4, alpha=0.6, color='lime')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        leg = plt.figlegend(ha, lb, loc="center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

        # Figure 2: same predictions with the returned bands shaded.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x2=1', lw=4, alpha=0.6, color='orange')
        plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
        # The second curve belongs to the x2=-1 fit: pair pr2 with its own
        # grid fvals2 and label it accordingly (it was previously drawn
        # against fvals1 and mislabelled 'x2=1').
        plt.plot(fvals2, pr2, '-', label='x2=-1', lw=4, alpha=0.6, color='lime')
        plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, loc="center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)
    def test_noformula_prediction(self):
        """Smoke-test ``predict_functional`` on an OLS model built from arrays.

        Fits OLS to a plain NumPy design matrix (no formula), then plots the
        prediction over the focus variable ``x1`` with ``x2`` fixed at 1 and
        at -1: once as plain lines and once with the returned bands
        shaded.  No numerical result is asserted; each figure is handed to
        ``self.close_or_save``.
        """

        np.random.seed(6434)
        n = 200
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        x3 = np.random.normal(size=n)
        y = x1 - x2 + np.random.normal(size=n)

        # Unnamed columns; the names "x1".."x3" used below presumably match
        # statsmodels' default exog names -- confirm if the design changes.
        exog = np.vstack((x1, x2, x3)).T

        model = sm.OLS(y, exog)
        result = model.fit()

        summaries = {"x3": pctl(0.75)}
        values = {"x2": 1}
        pr1, ci1, fvals1 = predict_functional(result, "x1", summaries, values)

        values = {"x2": -1}
        pr2, ci2, fvals2 = predict_functional(result, "x1", summaries, values)

        # Figure 1: point predictions only.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x2=1', lw=4, alpha=0.6, color='orange')
        plt.plot(fvals2, pr2, '-', label='x2=-1', lw=4, alpha=0.6, color='lime')
        ha, lb = ax.get_legend_handles_labels()
        # `loc` must be given by keyword: newer Matplotlib releases no
        # longer accept it as a third positional argument.
        leg = plt.figlegend(ha, lb, loc="center right")
        leg.draw_frame(False)
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)

        # Figure 2: same predictions with the returned bands shaded.
        plt.clf()
        fig = plt.figure()
        ax = plt.axes([0.1, 0.1, 0.7, 0.8])
        plt.plot(fvals1, pr1, '-', label='x2=1', lw=4, alpha=0.6, color='orange')
        plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
        # The second curve belongs to the x2=-1 fit: pair pr2 with its own
        # grid fvals2 and label it accordingly (it was previously drawn
        # against fvals1 and mislabelled 'x2=1').
        plt.plot(fvals2, pr2, '-', label='x2=-1', lw=4, alpha=0.6, color='lime')
        plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
        ha, lb = ax.get_legend_handles_labels()
        plt.figlegend(ha, lb, loc="center right")
        plt.xlabel("Focus variable", size=15)
        plt.ylabel("Fitted mean", size=15)
        plt.title("Linear model prediction")
        self.close_or_save(fig)
    def test_glm_formula(self):
        """Smoke-test ``predict_functional`` on a formula-based binomial GLM.

        Simulates a binary response whose log odds are quadratic in ``x1``
        and shifted for ``x3 == "B"``, fits a binomial GLM, and for both
        ``linear=False`` and ``linear=True`` plots the prediction over the
        focus variable ``x1`` with ``x3`` fixed at "B" and at "C".  The
        first figure of each pass overlays the generating curve; the second
        shades the returned bands.  No numerical result is asserted; each
        figure is handed to ``self.close_or_save``.
        """

        np.random.seed(542)
        n = 500
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        # x3 is a categorical covariate with labels drawn from "A".."C".
        x3 = np.random.randint(0, 3, size=n)
        x3 = np.asarray(["ABC"[i] for i in x3])
        lin_pred = -1 + 0.5*x1**2 + (x3 == "B")
        prob = 1 / (1 + np.exp(-lin_pred))
        y = 1 * (np.random.uniform(size=n) < prob)

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + I(x1**2) + x2 + C(x3)"
        model = sm.GLM.from_formula(fml, family=sm.families.Binomial(), data=df)
        result = model.fit()
        summaries = {"x2": np.mean}

        for linear in False, True:

            values = {"x3": "B"}
            pr1, ci1, fvals1 = predict_functional(result, "x1", summaries, values, linear=linear)

            values = {"x3": "C"}
            pr2, ci2, fvals2 = predict_functional(result, "x1", summaries, values, linear=linear)

            # Generating linear predictor on each grid ("B" adds 1).
            exact1 = -1 + 0.5*fvals1**2 + 1
            exact2 = -1 + 0.5*fvals2**2

            if not linear:
                # Map the exact curves to the probability scale as well.
                exact1 = 1 / (1 + np.exp(-exact1))
                exact2 = 1 / (1 + np.exp(-exact2))

            # Both figures of this pass share the same y-axis label.
            if linear:
                ylabel = "Fitted linear predictor"
            else:
                ylabel = "Fitted probability"

            # Figure 1: estimates against the exact curves.
            plt.clf()
            fig = plt.figure()
            ax = plt.axes([0.1, 0.1, 0.7, 0.8])
            plt.plot(fvals1, pr1, '-', label='x3=B')
            plt.plot(fvals2, pr2, '-', label='x3=C')
            plt.plot(fvals1, exact1, '-', label='x3=B (exact)')
            plt.plot(fvals2, exact2, '-', label='x3=C (exact)')
            ha, lb = ax.get_legend_handles_labels()
            # `loc` must be given by keyword: newer Matplotlib releases no
            # longer accept it as a third positional argument.
            plt.figlegend(ha, lb, loc="center right")
            plt.xlabel("Focus variable", size=15)
            plt.ylabel(ylabel, size=15)
            plt.title("Binomial GLM prediction")
            self.close_or_save(fig)

            # Figure 2: estimates with the returned bands shaded.
            plt.clf()
            fig = plt.figure()
            ax = plt.axes([0.1, 0.1, 0.7, 0.8])
            plt.plot(fvals1, pr1, '-', label='x3=B', color='orange')
            plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
            plt.plot(fvals2, pr2, '-', label='x3=C', color='lime')
            plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
            ha, lb = ax.get_legend_handles_labels()
            plt.figlegend(ha, lb, loc="center right")
            plt.xlabel("Focus variable", size=15)
            plt.ylabel(ylabel, size=15)
            plt.title("Binomial GLM prediction")
            self.close_or_save(fig)
Ejemplo n.º 10
0
# Keep rows with SALE_DATE of at least 20 * 365.25 (SALE_DATE appears to be
# in days -- see the year labels below), then restrict to the modelled
# columns and drop incomplete rows.
mort = mort.loc[mort.SALE_DATE >= 365.25 * 20]
mort = mort[[
    "log_MORTGAGE_AMOUNT", "log_SALE_AMOUNT", "SALE_DATE", "FIPS", "year"
]].dropna()

# Regress log mortgage amount on spline bases of log sale amount and sale
# date, including their interaction.
model1 = sm.OLS.from_formula(
    "log_MORTGAGE_AMOUNT ~ bs(log_SALE_AMOUNT, 8) * bs(SALE_DATE, 8)",
    data=mort)
result1 = model1.fit()

# One fitted curve per sale date, at five-year steps labelled 1985 to 2015.
plt.clf()
ax = plt.axes([0.1, 0.12, 0.75, 0.8])
for k in range(7):
    pred, cb, fvals = predict_functional(
        result1,
        "log_SALE_AMOUNT",
        values={"SALE_DATE": 365.25 * (25 + 5 * k)})
    plt.plot(fvals, pred, '-', label="%4d" % (1960 + 25 + 5 * k))
ha, lb = ax.get_legend_handles_labels()
# `loc` must be given by keyword: newer Matplotlib releases no longer
# accept it as a third positional argument.
leg = plt.figlegend(ha, lb, loc="center right")
leg.draw_frame(False)
plt.xlabel("log2 Sale Amount", size=15)
plt.ylabel("log2 Mortgage Amount", size=15)
plt.grid(True)
pdf.savefig()

plt.clf()
ax = plt.axes([0.12, 0.12, 0.75, 0.8])
for k in [16, 17, 18, 19]:
    pred, cb, fvals = predict_functional(result1,
                                         "SALE_DATE",
Ejemplo n.º 11
0
plt.grid(True)
plt.plot(result1.fittedvalues, result1.resid, "o", alpha=0.5)
plt.xlabel("Fitted values", fontsize=15)
plt.ylabel("Residuals", fontsize=15)


# __Plots of the fitted conditional mean function__
#
#

# A fitted regression relationship is often easiest to read as a plot.  In
# a multiple regression this is usually done by holding every covariate
# (independent variable) except one fixed at a chosen value, often its mean.
# Below, for the multiple regression fit above, the population growth rate
# and the real cost of borrowing are held at their means, and the fitted log
# housing price index is shown as a function of log household income.

# In[24]:

pr, cb, xv = predict_functional(
    result1,
    "lpci_real",
    summaries=dict(pgr=np.mean, rcb=np.mean),
)

plt.clf()
plt.grid(True)
plt.plot(xv, pr, "-", color="orange", linewidth=4)
plt.fill_between(xv, cb[:, 0], cb[:, 1], color="grey")
plt.xlabel("Log disposable income", fontsize=15)
plt.ylabel("Log house price index", fontsize=15)

# To see how two predictors work together in the regression model, we can plot one of them along the horizontal axis, and plot different lines for different fixed values of the other covariate.  We do this below with the population growth rate and disposable income variables.

# In[25]:

yp = []
for k in range(3):
Ejemplo n.º 12
0
    def test_scb(self):
        """Plot pointwise and simultaneous bands for three GLM families.

        For Poisson, binomial and Gaussian families, a response that does
        not depend on the covariates is simulated and a GLM is fit to it.
        For both ``linear=False`` and ``linear=True`` the bands returned by
        ``predict_functional`` with its default ``ci_method`` and with
        ``ci_method='simultaneous'`` are drawn together with the estimate
        and the known constant truth.  No numerical result is asserted;
        each figure is handed to ``self.close_or_save``.
        """

        np.random.seed(473)
        n = 100
        x = np.random.normal(size=(n,4))
        # First column is a constant.
        x[:, 0] = 1

        for fam_name in "poisson", "binomial", "gaussian":

            # y is drawn independently of x, so the true mean (and the true
            # linear predictor) is a known constant for each family.
            if fam_name == "poisson":
                y = np.random.poisson(20, size=n)
                fam = sm.families.Poisson()
                true_mean = 20
                true_lp = np.log(20)
            elif fam_name == "binomial":
                y = 1 * (np.random.uniform(size=n) < 0.5)
                fam = sm.families.Binomial()
                true_mean = 0.5
                true_lp = 0
            elif fam_name == "gaussian":
                y = np.random.normal(size=n)
                fam = sm.families.Gaussian()
                true_mean = 0
                true_lp = 0

            model = sm.GLM(y, x, family=fam)
            result = model.fit()

            # CB is for linear predictor or mean response
            for linear in False, True:

                true = true_lp if linear else true_mean

                # Names presumably follow statsmodels' defaults for an
                # unnamed exog array with a constant column -- confirm.
                values = {'const': 1, "x2": 0}
                summaries = {"x3": np.mean}
                pred1, cb1, fvals1 = predict_functional(result, "x1",
                            values=values, summaries=summaries, linear=linear)
                pred2, cb2, fvals2 = predict_functional(result, "x1",
                            values=values, summaries=summaries,
                            ci_method='simultaneous', linear=linear)

                plt.clf()
                fig = plt.figure()
                ax = plt.axes([0.1, 0.1, 0.58, 0.8])
                plt.plot(fvals1, pred1, '-', color='black', label='Estimate')
                plt.plot(fvals1, true * np.ones(len(pred1)), '-', color='purple',
                         label='Truth')
                # Only the lower edge of each band carries a label so the
                # legend shows one entry per band.
                plt.plot(fvals1, cb1[:, 0], color='blue', label='Pointwise CB')
                plt.plot(fvals1, cb1[:, 1], color='blue')
                plt.plot(fvals2, cb2[:, 0], color='green', label='Simultaneous CB')
                plt.plot(fvals2, cb2[:, 1], color='green')
                ha, lb = ax.get_legend_handles_labels()
                # `loc` must be given by keyword: newer Matplotlib releases
                # no longer accept it as a third positional argument.
                leg = plt.figlegend(ha, lb, loc="center right")
                leg.draw_frame(False)
                plt.xlabel("Focus variable", size=15)
                if linear:
                    plt.ylabel("Linear predictor", size=15)
                else:
                    plt.ylabel("Fitted mean", size=15)
                plt.title("%s family prediction" % fam_name.capitalize())

                self.close_or_save(fig)
Ejemplo n.º 13
0
                             groups="fips")
result7 = model7.fit(maxiter=5)

# One page per fitted model: estimated dependence parameters by lag.
for result in (result3, result6):
    plt.clf()
    plt.plot(result.cov_struct.dep_params)
    plt.grid(True)
    plt.gca().set_xticks(range(18))
    plt.xlim(0, 15)
    plt.xlabel("Lag (years)", fontsize=15)
    plt.ylabel("Autocorrelation", fontsize=15)
    pdf.savefig()

from statsmodels.sandbox import predict_functional

# Predictions over "year" with simultaneous bands, with logpop fixed at
# 10, 11 and 12 in turn.
(pred1, cb1, fvals1), (pred2, cb2, fvals2), (pred3, cb3, fvals3) = [
    predict_functional.predict_functional(
        result5, "year", ci_method="simultaneous", values={"logpop": logpop})
    for logpop in (10, 11, 12)
]

for k in range(2):
    plt.clf()
    plt.axes([0.15, 0.1, 0.72, 0.86])
    if k == 0:
        # Plot fitted values on the log scale
        plt.plot(fvals1, pred1, '-', label="10")
        plt.plot(fvals2, pred2, '-', label="11")
        plt.plot(fvals3, pred3, '-', label="12")
        plt.fill_between(fvals1, cb1[:, 0], cb1[:, 1], color='grey')
        plt.fill_between(fvals2, cb2[:, 0], cb2[:, 1], color='grey')
Ejemplo n.º 14
0
# This type of plot requires us to fix the values of all variables other
# than the dependent variable (SBP here), and one independent variable
# that we call the *focus variable* (which is age here).  Below we fix
# the gender as "female" and the BMI as 25.  Thus, the graphs below show
# the relationship between expected SBP and age for women with BMI equal
# to 25.

from statsmodels.sandbox.predict_functional import predict_functional

# Fix certain variables at reference values.  Not all of these
# variables are used here, but we provide them with a value anyway
# to prevent a warning message from appearing.
values = {"RIAGENDRx": "Female", "RIAGENDR": 2, "BMXBMI": 25,
          "DMDEDUC2": 1, "RIDRETH1": 1, "SMQ020": 1}

pr, cb, fv = predict_functional(result, "RIDAGEYR",
                values=values, ci_method="simultaneous")

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
ax = sns.lineplot(x=fv, y=pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.4)
ax.set_xlabel("Age")
_ = ax.set_ylabel("SBP")

# The analogous plot for BMI is shown next.  Here we fix the gender as
# "female" and the age at 50, so we are looking at the relationship
# between expected SBP and BMI for women of age 50.

# BMI becomes the focus variable, so it must no longer be held fixed;
# age is fixed instead.
del values["BMXBMI"]
values["RIDAGEYR"] = 50
pr, cb, fv = predict_functional(result, "BMXBMI",
                values=values, ci_method="simultaneous")
Ejemplo n.º 15
0
    def test_glm_formula(self):
        """Smoke-test ``predict_functional`` on a formula-based binomial GLM.

        Simulates a binary response whose log odds are quadratic in ``x1``
        and shifted for ``x3 == "B"``, fits a binomial GLM, and for both
        ``linear=False`` and ``linear=True`` plots the prediction over the
        focus variable ``x1`` with ``x3`` fixed at "B" and at "C".  The
        first figure of each pass overlays the generating curve; the second
        shades the returned bands.  No numerical result is asserted; each
        figure is handed to ``self.close_or_save``.
        """

        np.random.seed(542)
        n = 500
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        # x3 is a categorical covariate with labels drawn from "A".."C".
        x3 = np.random.randint(0, 3, size=n)
        x3 = np.asarray(["ABC"[i] for i in x3])
        lin_pred = -1 + 0.5 * x1**2 + (x3 == "B")
        prob = 1 / (1 + np.exp(-lin_pred))
        y = 1 * (np.random.uniform(size=n) < prob)

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})

        fml = "y ~ x1 + I(x1**2) + x2 + C(x3)"
        model = sm.GLM.from_formula(fml,
                                    family=sm.families.Binomial(),
                                    data=df)
        result = model.fit()
        summaries = {"x2": np.mean}

        for linear in False, True:

            values = {"x3": "B"}
            pr1, ci1, fvals1 = predict_functional(result,
                                                  "x1",
                                                  summaries,
                                                  values,
                                                  linear=linear)

            values = {"x3": "C"}
            pr2, ci2, fvals2 = predict_functional(result,
                                                  "x1",
                                                  summaries,
                                                  values,
                                                  linear=linear)

            # Generating linear predictor on each grid ("B" adds 1).
            exact1 = -1 + 0.5 * fvals1**2 + 1
            exact2 = -1 + 0.5 * fvals2**2

            if not linear:
                # Map the exact curves to the probability scale as well.
                exact1 = 1 / (1 + np.exp(-exact1))
                exact2 = 1 / (1 + np.exp(-exact2))

            # Both figures of this pass share the same y-axis label.
            if linear:
                ylabel = "Fitted linear predictor"
            else:
                ylabel = "Fitted probability"

            # Figure 1: estimates against the exact curves.
            plt.clf()
            fig = plt.figure()
            ax = plt.axes([0.1, 0.1, 0.7, 0.8])
            plt.plot(fvals1, pr1, '-', label='x3=B')
            plt.plot(fvals2, pr2, '-', label='x3=C')
            plt.plot(fvals1, exact1, '-', label='x3=B (exact)')
            plt.plot(fvals2, exact2, '-', label='x3=C (exact)')
            ha, lb = ax.get_legend_handles_labels()
            # `loc` must be given by keyword: newer Matplotlib releases no
            # longer accept it as a third positional argument.
            plt.figlegend(ha, lb, loc="center right")
            plt.xlabel("Focus variable", size=15)
            plt.ylabel(ylabel, size=15)
            plt.title("Binomial GLM prediction")
            self.close_or_save(fig)

            # Figure 2: estimates with the returned bands shaded.
            plt.clf()
            fig = plt.figure()
            ax = plt.axes([0.1, 0.1, 0.7, 0.8])
            plt.plot(fvals1, pr1, '-', label='x3=B', color='orange')
            plt.fill_between(fvals1, ci1[:, 0], ci1[:, 1], color='grey')
            plt.plot(fvals2, pr2, '-', label='x3=C', color='lime')
            plt.fill_between(fvals2, ci2[:, 0], ci2[:, 1], color='grey')
            ha, lb = ax.get_legend_handles_labels()
            plt.figlegend(ha, lb, loc="center right")
            plt.xlabel("Focus variable", size=15)
            plt.ylabel(ylabel, size=15)
            plt.title("Binomial GLM prediction")
            self.close_or_save(fig)
Ejemplo n.º 16
0
# Keep rows with SALE_DATE of at least 20 * 365.25.  The explicit copy
# ensures the column assignment below acts on an independent frame rather
# than on a slice of the original (avoids pandas' SettingWithCopyWarning).
df = df.loc[df.SALE_DATE >= 365.25 * 20].copy()
df["age"] = df.year - df.YEAR_BUILT

# NOTE(review): the bs(year, 6) * bs(age, 6) interaction appears both on its
# own and inside the parenthesised group; presumably redundant -- confirm.
fml = "log_SALE_AMOUNT ~ bs(year, 6) * bs(age, 6) + bs(year, 6) * (bs(LAND_SQUARE_FOOTAGE, 6) + bs(LIVING_SQUARE_FEET, 6) + bs(age, 6))"
model = sm.OLS.from_formula(fml, df)
result = model.fit()

# The context manager closes the PDF even if plotting raises.
with PdfPages("salesprice_lm.pdf") as pdf:

    # One fitted curve over year of sale for each fixed age; both
    # square-footage covariates are summarised by their medians.
    plt.clf()
    for age in 0, 10, 20, 40:
        pred, cb, fvals = predict_functional(result,
                                             "year",
                                             values={"age": age},
                                             summaries={
                                                 "LAND_SQUARE_FOOTAGE": np.median,
                                                 "LIVING_SQUARE_FEET": np.median
                                             })
        plt.plot(fvals, pred, '-', label=str(age))

    plt.grid(True)
    ha, lb = plt.gca().get_legend_handles_labels()
    # `loc` must be given by keyword: newer Matplotlib releases no longer
    # accept it as a third positional argument.
    leg = plt.figlegend(ha, lb, loc="center right")
    leg.draw_frame(False)
    plt.ylabel("Sales price (log2)", size=15)
    plt.xlabel("Year of sale", size=15)

    pdf.savefig()
Ejemplo n.º 17
0
# +
from statsmodels.sandbox.predict_functional import predict_functional

# Reference values at which the non-focus variables are held fixed.
values = {
    "RIAGENDRx": "Female",
    "RIAGENDR": 1,
    "BMXBMI": 25,
    "DMDEDUC2": 1,
    "RIDRETH1": 1,
    "SMQ020": 1,
    "DMDEDUC2x": "College",
    "BPXSY1": 120
}

pr, cb, fv = predict_functional(result,
                                "RIDAGEYR",
                                values=values,
                                ci_method="simultaneous")

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
ax = sns.lineplot(x=fv, y=pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.4)
ax.set_xlabel("Age")
_ = ax.set_ylabel("Smoking")
# -

# We can display the same plot in terms of probabilities instead of in
# terms of log odds.  The probability can be obtained from the log odds
# using the relationship `p = 1 / (1 + exp(-o))` where `o` is the log
# odds.  Note that while the age and log odds are linearly related, age
# has a curved relationship with probability.  This is necessary because
# probabilities must remain between 0 and 1, whereas a linear relationship
# would eventually leave that range.
Ejemplo n.º 18
0
    def test_scb(self):
        """Plot pointwise and simultaneous bands for three GLM families.

        For Poisson, binomial and Gaussian families, a response that does
        not depend on the covariates is simulated and a GLM is fit to it.
        For both ``linear=False`` and ``linear=True`` the bands returned by
        ``predict_functional`` with its default ``ci_method`` and with
        ``ci_method='simultaneous'`` are drawn together with the estimate
        and the known constant truth.  No numerical result is asserted;
        each figure is handed to ``self.close_or_save``.
        """

        np.random.seed(473)
        n = 100
        x = np.random.normal(size=(n, 4))
        # First column is a constant.
        x[:, 0] = 1

        for fam_name in "poisson", "binomial", "gaussian":

            # y is drawn independently of x, so the true mean (and the true
            # linear predictor) is a known constant for each family.
            if fam_name == "poisson":
                y = np.random.poisson(20, size=n)
                fam = sm.families.Poisson()
                true_mean = 20
                true_lp = np.log(20)
            elif fam_name == "binomial":
                y = 1 * (np.random.uniform(size=n) < 0.5)
                fam = sm.families.Binomial()
                true_mean = 0.5
                true_lp = 0
            elif fam_name == "gaussian":
                y = np.random.normal(size=n)
                fam = sm.families.Gaussian()
                true_mean = 0
                true_lp = 0

            model = sm.GLM(y, x, family=fam)
            result = model.fit()

            # CB is for linear predictor or mean response
            for linear in False, True:

                true = true_lp if linear else true_mean

                # Names presumably follow statsmodels' defaults for an
                # unnamed exog array with a constant column -- confirm.
                values = {'const': 1, "x2": 0}
                summaries = {"x3": np.mean}
                pred1, cb1, fvals1 = predict_functional(result,
                                                        "x1",
                                                        values=values,
                                                        summaries=summaries,
                                                        linear=linear)
                pred2, cb2, fvals2 = predict_functional(
                    result,
                    "x1",
                    values=values,
                    summaries=summaries,
                    ci_method='simultaneous',
                    linear=linear)

                plt.clf()
                fig = plt.figure()
                ax = plt.axes([0.1, 0.1, 0.58, 0.8])
                plt.plot(fvals1, pred1, '-', color='black', label='Estimate')
                plt.plot(fvals1,
                         true * np.ones(len(pred1)),
                         '-',
                         color='purple',
                         label='Truth')
                # Only the lower edge of each band carries a label so the
                # legend shows one entry per band.
                plt.plot(fvals1, cb1[:, 0], color='blue', label='Pointwise CB')
                plt.plot(fvals1, cb1[:, 1], color='blue')
                plt.plot(fvals2,
                         cb2[:, 0],
                         color='green',
                         label='Simultaneous CB')
                plt.plot(fvals2, cb2[:, 1], color='green')
                ha, lb = ax.get_legend_handles_labels()
                # `loc` must be given by keyword: newer Matplotlib releases
                # no longer accept it as a third positional argument.
                leg = plt.figlegend(ha, lb, loc="center right")
                leg.draw_frame(False)
                plt.xlabel("Focus variable", size=15)
                if linear:
                    plt.ylabel("Linear predictor", size=15)
                else:
                    plt.ylabel("Fitted mean", size=15)
                plt.title("%s family prediction" % fam_name.capitalize())

                self.close_or_save(fig)
Ejemplo n.º 19
0
#https://towardsdatascience.com/logistic-regression-model-fitting-and-finding-the-correlation-p-value-z-score-confidence-8330fb86db19
#With help from this site

# In[92]:

from statsmodels.sandbox.predict_functional import predict_functional

# In[104]:

# Values at which the non-focus covariates are held fixed.
values = {"hist": 0, "tumorsize": 50, "accinsitu": 0, "lymphinv": 0}

# In[105]:

pr, cb, fv = predict_functional(result,
                                "age",
                                values=values,
                                ci_method="simultaneous")

# In[106]:

# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally.
ax = sns.lineplot(x=fv, y=pr, lw=4)
ax.fill_between(fv, cb[:, 0], cb[:, 1], color='grey', alpha=0.4)
ax.set_xlabel("age")
ax.set_ylabel("Re-excision")

ax.set_title('Fitted Model: Log-odd probability of Age by Re-excision')

#This plot of fitted log-odds visualizes the effect of age on re-excision for
#hist=0, tumorsize=50, accinsitu=0 and lymphinv=0 under the fitted GLM.
#A slight negative association between age and re-excision is visible in
#this plot for the covariate values described above.