Ejemplo n.º 1
0
 def get_model(self, df, formula='np.log(pris) ~ Kmstand'):
     """Fit an ordinary least squares model to ``df`` and return the result.

     The data frame is stored on the instance as ``self.df`` before fitting.

     Args:
         df: Data passed to the model; it must provide the variables named
             in ``formula``.
         formula: Formula string handed to ``sm.formula.ols``.

     Returns:
         The fitted results object returned by ``.fit()``.
     """
     self.df = df
     # missing='drop' is forwarded to statsmodels so that incomplete
     # observations do not abort the fit.
     return sm.formula.ols(formula, data=self.df, missing='drop').fit()
import pandas as pd
from statsmodels.formula.api import ols
from matplotlib import pyplot as plt
# LINEAR REGRESSION USING CATEGORICAL VARIABLES
from patsy.contrasts import Treatment
d = pd.read_csv('linearRegression.csv', sep=',')
print(d.shape)
print(d.head(10))
###########################################
# Illustration only -- this part is not needed for the modelling below.
# Treatment contrasts: k categories are coded with k-1 columns.
levels = [1, 2, 3, 4]
# NOTE(review): patsy appears to match `reference` against the level values
# before treating it as a positional index, so reference=1 would select the
# level 1 (the first level) as baseline -- confirm against the patsy docs.
contrast = Treatment(reference=1).code_without_intercept(levels)
print(contrast.matrix)
# race is coded from 1 while the matrix rows are indexed from 0, hence the
# subtraction; only the first 20 encoded rows are shown.
print(contrast.matrix[d.race - 1, :][0:20])
############################################
# Fitting the model with a treatment contrast for race.

model = ols("write~ + C(race, Treatment) + read + math + science", data=d)
rs = model.fit()
print(rs.summary())
print('model predictions: ')
# Observed `write` goes on the x axis and the fitted values on the y axis;
# the blue identity line marks where a perfect prediction would fall.
plt.plot(d.write, rs.predict(d), 'ro')
plt.plot(d.write, d.write, '-', color='blue')
plt.xlabel("Write")
plt.ylabel('Prediction')
Ejemplo n.º 3
0
 def code_without_intercept(self, levels):
     """Delegate to ``Treatment(reference=0).code_without_intercept``.

     Args:
         levels: The category levels forwarded unchanged to the delegate.

     Returns:
         Whatever the delegate returns for ``levels``.
     """
     treatment = Treatment(reference=0)
     return treatment.code_without_intercept(levels)
Ejemplo n.º 4
0
# write, for each level of race ((1 = Hispanic, 2 = Asian, 3 = African
# American and 4 = Caucasian)).

# Mean of `write` within each `race` group. This is a bare expression: its
# value is displayed only in an interactive session or notebook and is
# discarded when the file runs as a plain script.
hsb2.groupby('race')['write'].mean()

# #### Treatment (Dummy) Coding

# Dummy coding is probably the best-known coding scheme. It compares each
# level of the categorical variable to a base reference level, and the
# intercept is the value for that reference level. It is the default
# contrast in Patsy for unordered categorical factors. The Treatment
# contrast matrix for race would be

from patsy.contrasts import Treatment
levels = [1, 2, 3, 4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)

# Here we used `reference=0`, which implies that the first level,
# Hispanic, is the reference category against which the other level effects
# are measured. As mentioned above, the columns do not sum to zero and are
# thus not independent of the intercept. To be explicit, let's look at how
# this would encode the `race` variable.

# First ten raw race codes (bare expression, displayed interactively only).
hsb2.race.head(10)

# race is coded 1-4 while the matrix rows are indexed from 0, so subtracting
# one turns each code into a row index; the first 20 encoded rows are shown.
print(contrast.matrix[hsb2.race - 1, :][:20])

# NOTE(review): presumably builds a dummy-variable encoding of the race
# values for comparison -- confirm that `sm.categorical` is still available
# in the installed statsmodels version.
sm.categorical(hsb2.race.values)

# This is a bit of a trick, as the `race` category conveniently maps to
Ejemplo n.º 5
0
# Zero out the NaN entries of `fico`.
fico[np.isnan(fico)] = 0
loansData['log_income'] = np.log1p(loansData['Monthly.Income'])

# One indicator column per ownership category; the first column is dropped
# so that its category acts as the baseline of the regression below.
ownership_dummies = pd.get_dummies(loansData['Home.Ownership'],
                                   prefix='ownership').iloc[:, 1:]

# Concatenate the dummy variable columns onto the original DataFrame.
data = pd.concat([loansData, ownership_dummies], axis=1)
# 'Interest.Rate' contains a dot, which cannot be used as a plain name in a
# formula, so the column is renamed.
data.rename(columns={'Interest.Rate': 'Interest_Rate'},
            inplace=True)

est = smf.ols(
    formula=
    "Interest_Rate ~ log_income + ownership_NONE + ownership_OTHER +ownership_OWN + ownership_RENT",
    data=data).fit()

# Bare expression: shown in an interactive session, discarded in a script.
est.summary()

#################################################################

# NOTE: this is an alias, not a copy -- the in-place rename below also
# renames the column on `loansData` itself.
loansData_ = loansData
levels = ['NONE', 'OTHER', 'RENT', 'OWN', 'MORTGAGE']
# Illustrative treatment contrast for the ownership levels; the model below
# lets the formula build its own coding and does not use this object.
ownership_dummies1 = Treatment(reference=0).code_without_intercept(levels)

loansData_.rename(columns={'Interest.Rate': 'Interest_Rate'}, inplace=True)
# Treatment-code the categorical ownership column rather than the continuous
# log_income, which stays an ordinary regressor. Q(...) quotes the column
# name because it contains a dot.
mod = smf.ols(
    "Interest_Rate ~ log_income + C(Q('Home.Ownership'), Treatment)",
    data=loansData_).fit()

# Bare expression: shown in an interactive session, discarded in a script.
mod.summary()
Ejemplo n.º 6
0
def _normalize_variable_name(name):
    """Return the last "/"-separated segment of *name*, spaces as underscores."""
    return name.split("/")[-1].replace(" ", "_")


def _append_text(path, chunks):
    """Append every string in *chunks*, in order, to the file at *path*."""
    with open(path, "a") as report:
        report.writelines(chunks)


def contrasting():
    """Fit and report the OLS model under five contrast coding schemes.

    Does nothing when the module-level ``c`` is empty. Otherwise ``c`` holds
    one contrast variable name or several comma-separated ones. Each name is
    reduced to its last "/"-separated segment (to account for URLs) and has
    its spaces replaced by underscores; the entries of
    ``full_model_variable_list`` are normalized the same way, in place.

    For each of the Treatment, Simple, Sum, Diff and Helmert codings a model
    ``dep_var ~ <remaining variables> + C(<contrast variable>, <coding>)`` is
    fitted on ``df_final``. Its description and summary are printed and, when
    ``o`` is not None, appended to the file named by ``o``.

    Every contrast variable gets its own ``C(...)`` term. Joining several
    names inside a single ``C(a + b, ...)`` would make the formula engine add
    the columns together instead of coding each one.

    Raises:
        ValueError: if ``c`` contains only separators and no variable name.
    """
    global c
    if not c:
        return

    if "," in c:
        # Several contrast variables were supplied; blank entries (e.g. from
        # a trailing comma) are ignored.
        contrastvars = [
            _normalize_variable_name(part.strip())
            for part in c.split(",")
            if part.strip()
        ]
        if not contrastvars:
            raise ValueError("no contrast variable name found in %r" % (c,))
    else:
        # Normalize before filtering below, so the contrast variable is not
        # also kept as an ordinary independent variable.
        c = _normalize_variable_name(c)
        contrastvars = [c]

    # Slice assignment keeps the update in place on the shared list.
    full_model_variable_list[:] = [
        _normalize_variable_name(var) for var in full_model_variable_list
    ]
    independent_terms = [
        var for var in full_model_variable_list if var not in contrastvars
    ]

    def _name_levels(prefix, levels):
        return ["[%s%s]" % (prefix, level) for level in levels]

    # `Simple` has to be a local name of this function: the formula strings
    # below refer to it by name and it is looked up from the scope that
    # calls `ols`.
    class Simple(object):
        def _simple_contrast(self, levels):
            nlevels = len(levels)
            contr = -1. / nlevels * np.ones((nlevels, nlevels - 1))
            contr[1:][np.diag_indices(nlevels -
                                      1)] = (nlevels - 1.) / nlevels
            return contr

        def code_with_intercept(self, levels):
            c = np.column_stack(
                (np.ones(len(levels)), self._simple_contrast(levels)))
            return ContrastMatrix(c, _name_levels("Simp.", levels))

        def code_without_intercept(self, levels):
            c = self._simple_contrast(levels)
            return ContrastMatrix(c, _name_levels("Simp.", levels[:-1]))

    treatment_subtitle = "With contrast (treatment coding)"
    codings = (
        ("Treatment",
         "Treatment (Dummy) Coding: Dummy coding compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept."
         ),
        ("Simple",
         "Simple Coding: Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors."
         ),
        ("Sum",
         "Sum (Deviation) Coding: Sum coding compares the mean of the dependent variable for a given level to the overall mean of the dependent variable over all the levels."
         ),
        ("Diff",
         "Backward Difference Coding: In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level."
         ),
        ("Helmert",
         "Helmert Coding: Our version of Helmert coding is sometimes referred to as Reverse Helmert Coding. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels. Hence, the name ‘reverse’ being sometimes applied to differentiate from forward Helmert coding."
         ),
    )

    for coding, description in codings:
        is_treatment = coding == "Treatment"
        contrast_terms = ["C(%s, %s)" % (var, coding) for var in contrastvars]
        formula = dep_var + " ~ " + " + ".join(independent_terms +
                                               contrast_terms)

        print("\n\n" + description)
        # Keep this call directly in this function (not in a helper) so that
        # the local `Simple` class stays visible to the formula.
        res = ols(formula, data=df_final).fit()
        if is_treatment:
            print(treatment_subtitle)
        print(res.summary())

        if o is not None:
            if is_treatment:
                # The first section also carries the report header.
                chunks = [
                    "\n" + full_model,
                    "\n\n***********************************************************************************************************",
                    "\n\n\n\n" + description,
                    treatment_subtitle,
                ]
            else:
                chunks = ["\n\n\n" + description]
            chunks.append(res.summary().as_text())
            # Appended after every fit, so earlier sections are kept even if
            # a later coding fails.
            _append_text(o, chunks)