def fit_helmert_coding(col, values, handle_missing, handle_unknown):
    """Build the Helmert contrast table for one column.

    Parameters
    ----------
    col
        Column identifier; ``str(col)`` prefixes every output column name
        (``<col>_0``, ``<col>_1``, ...).
    values
        pandas Series whose values are the codes to encode. It must support
        boolean-mask indexing, ``.values`` and ``.loc``.
    handle_missing : str
        ``'value'`` keeps only the entries of ``values`` greater than 0 and
        appends an all-zero row labelled ``-2``. ``'return_nan'`` sets the
        row(s) labelled ``values.loc[np.nan]`` to NaN.
    handle_unknown : str
        ``'indicator'`` adds ``-1`` to the encoded levels. ``'return_nan'``
        adds a NaN row labelled ``-1``. ``'value'`` adds an all-zero row
        labelled ``-1``.

    Returns
    -------
    pandas.DataFrame
        One row per encoded value (plus the extra rows described above) and
        one column per Helmert contrast. With fewer than two values left to
        encode, a frame with that index and no columns is returned.
    """
    if handle_missing == 'value':
        values = values[values > 0]

    # Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
    # .values is available on every pandas version.
    values_to_encode = values.values

    if len(values) < 2:
        # Not enough levels for a contrast: index only, no columns.
        return pd.DataFrame(index=values_to_encode)

    if handle_unknown == 'indicator':
        values_to_encode = np.append(values_to_encode, -1)

    helmert_contrast_matrix = Helmert().code_without_intercept(
        values_to_encode)
    n_contrasts = len(helmert_contrast_matrix.column_suffixes)
    df = pd.DataFrame(
        data=helmert_contrast_matrix.matrix,
        index=values_to_encode,
        columns=[str(col) + '_%d' % (i, ) for i in range(n_contrasts)])

    if handle_unknown == 'return_nan':
        df.loc[-1] = np.nan
    elif handle_unknown == 'value':
        df.loc[-1] = np.zeros(len(values_to_encode) - 1)

    if handle_missing == 'return_nan':
        df.loc[values.loc[np.nan]] = np.nan
    elif handle_missing == 'value':
        df.loc[-2] = np.zeros(len(values_to_encode) - 1)

    return df
def fit_helmert_coding(values):
    """Return the Helmert contrast table for ``values`` as a DataFrame.

    The contrast rows are labelled starting at 1, and an all-zero row
    labelled 0 is appended after them. Columns are named with the contrast
    matrix's column suffixes. With fewer than two values an empty DataFrame
    is returned.
    """
    n_values = len(values)
    if n_values < 2:
        return pd.DataFrame()

    contrast = Helmert().code_without_intercept(values)
    coding = pd.DataFrame(data=contrast.matrix,
                          columns=contrast.column_suffixes)

    # Shift the default 0-based labels up by one so that label 0 is free
    # for the all-zero row added below.
    coding.index = coding.index + 1
    coding.loc[0] = np.zeros(n_values - 1)
    return coding
def buildEffectContrastMatrix(self, i):
    """Return the Helmert contrast matrix for ``range(self.mk[i])``.

    When ``self.helmertConvert`` is truthy the matrix is rescaled in place,
    one column at a time, before it is returned.
    """

    def rescale_columns(mat):
        # Columns are handled from the last one down to the first. For
        # column `col`, the pivot is the entry one row below the diagonal.
        ncols = mat.shape[1]
        for col in reversed(range(ncols)):
            pivot_row = col + 1
            # Divide the column by its pivot entry.
            mat[:, col] /= mat[pivot_row, col]
            # Sum of squares over the whole pivot row (all columns).
            row_sq_sum = np.power(mat[pivot_row, :], 2).sum()
            factor = np.sqrt(1 - 1. / (ncols + 1) - row_sq_sum +
                             np.power(mat[pivot_row, col], 2))
            mat[:, col] *= factor
        return mat

    helmert = Helmert().code_without_intercept(range(self.mk[i])).matrix
    return rescale_columns(helmert) if self.helmertConvert else helmert
# Coefficient of the first Diff contrast for `race`, followed by the gap
# between the mean of `write` at race level 2 and at race level 1.
res.params["C(race, Diff)[D.1]"]
(hsb2.groupby("race").mean()["write"][2]
 - hsb2.groupby("race").mean()["write"][1])

# ### Helmert Coding

# The Helmert coding used here is sometimes called Reverse Helmert Coding:
# the mean of the dependent variable at a level is compared with its mean
# over all previous levels. "Reverse" distinguishes it from forward Helmert
# coding. Such a comparison is not very meaningful for a nominal variable
# like race, but this is how the Helmert contrast would be used:

from patsy.contrasts import Helmert

contrast = Helmert().code_without_intercept(levels)
print(contrast.matrix)

mod = ols("write ~ C(race, Helmert)", data=hsb2)
res = mod.fit()
print(res.summary())

# As an illustration, the level-4 comparison is the mean of the dependent
# variable at level 4 minus its mean over the previous three levels.
grouped = hsb2.groupby("race")
grouped.mean()["write"][4] - grouped.mean()["write"][:3].mean()

# The two quantities agree only up to a constant. Other versions of the
# Helmert contrast give the actual difference in means. Either way, the
# hypothesis tests are the same.
def _append_to_report(path, chunks):
    """Append every string in ``chunks``, in order, to the file at ``path``.

    The file is opened in append mode inside a ``with`` block so the handle
    is closed even if one of the writes raises.
    """
    with open(path, "a") as report:
        for chunk in chunks:
            report.write(chunk)


def contrasting():
    """Fit and report one OLS model per contrast coding.

    Does nothing when the module-level ``c`` is falsy. Otherwise:

    1. Cleans the contrast variable name(s) in ``c``. A comma-separated
       value becomes a list of names, each stripped, with spaces turned into
       underscores and reduced to the text after the last "/". A single
       value is reduced to the text after its last "/" and then has its
       spaces turned into underscores; the result is written back to the
       global ``c``.
    2. Cleans every entry of ``full_model_variable_list`` in place, keeping
       the text after the last "/" and turning spaces into underscores.
    3. Fits ``dep_var ~ <other variables> + C(<contrast>, <coding>)`` on
       ``df_final`` for Treatment, Simple, Sum, Diff and Helmert coding, in
       that order, printing an explanation and the model summary for each.
    4. When ``o`` is not None, appends the same text to the file ``o``.

    Reads the module-level names ``c``, ``full_model_variable_list``,
    ``dep_var``, ``df_final``, ``levels``, ``o`` and ``full_model``.
    Returns None.
    """
    global c
    if not c:
        return

    # --- contrast variable name(s) -------------------------------------
    contrastvars = []
    if "," in c:
        # Several contrast variables: strip, underscore the spaces, then
        # keep only the last URL segment of each.
        contrastvars = [
            part.strip().replace(" ", "_").split("/")[-1]
            for part in c.split(",")
        ]
    else:
        c = c.split("/")[-1]  # to account for URLs

    # --- model variable names, cleaned in place ------------------------
    for position, name in enumerate(full_model_variable_list):
        full_model_variable_list[position] = (
            name.split("/")[-1].replace(" ", "_"))

    ind_vars_no_contrast_var = " + ".join(
        var for var in full_model_variable_list
        if var != c and var not in contrastvars)

    if contrastvars:
        contraststring = ' + '.join(contrastvars)
    else:
        c = c.replace(" ", "_")
        contraststring = c

    # Defining the Simple class
    def _name_levels(prefix, levels):
        return ["[%s%s]" % (prefix, level) for level in levels]

    class Simple(object):
        def _simple_contrast(self, levels):
            nlevels = len(levels)
            contr = -1. / nlevels * np.ones((nlevels, nlevels - 1))
            contr[1:][np.diag_indices(nlevels - 1)] = (nlevels - 1.) / nlevels
            return contr

        def code_with_intercept(self, levels):
            c = np.column_stack(
                (np.ones(len(levels)), self._simple_contrast(levels)))
            return ContrastMatrix(c, _name_levels("Simp.", levels))

        def code_without_intercept(self, levels):
            c = self._simple_contrast(levels)
            return ContrastMatrix(c, _name_levels("Simp.", levels[:-1]))

    # Every formula shares this head; only the coding name differs.
    # NOTE: the ols(...) calls must stay in this function body. The formula
    # names the coding class (e.g. the local `Simple`), and the formula
    # machinery resolves that name from the namespace of the frame that
    # calls ols.
    formula_head = (dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                    contraststring + ", ")

    # --- With contrast (treatment coding) ------------------------------
    treatment_intro = (
        "\n\nTreatment (Dummy) Coding: Dummy coding compares each level of "
        "the categorical variable to a base reference level. The base "
        "reference level is the value of the intercept.")
    print(treatment_intro)
    # The result is unused. The call is kept so that a `levels` value this
    # coding cannot handle still raises here, before the model is fitted.
    Treatment(reference=0).code_without_intercept(levels)
    res = ols(formula_head + "Treatment)", data=df_final).fit()
    print("With contrast (treatment coding)")
    print(res.summary())
    if o is not None:
        _append_to_report(o, [
            "\n" + full_model,
            "\n\n***********************************************************************************************************",
            "\n\n" + treatment_intro,
            "With contrast (treatment coding)",
            res.summary().as_text(),
        ])

    # --- Simple, sum/deviation, backward difference, Helmert -----------
    # Each entry: (name used in the formula, coding class, explanation).
    remaining_codings = (
        ("Simple", Simple,
         "\n\nSimple Coding: Like Treatment Coding, Simple Coding compares "
         "each level to a fixed reference level. However, with simple "
         "coding, the intercept is the grand mean of all the levels of the "
         "factors."),
        ("Sum", Sum,
         "\n\nSum (Deviation) Coding: Sum coding compares the mean of the "
         "dependent variable for a given level to the overall mean of the "
         "dependent variable over all the levels."),
        ("Diff", Diff,
         "\n\nBackward Difference Coding: In backward difference coding, "
         "the mean of the dependent variable for a level is compared with "
         "the mean of the dependent variable for the prior level."),
        ("Helmert", Helmert,
         "\n\nHelmert Coding: Our version of Helmert coding is sometimes "
         "referred to as Reverse Helmert Coding. The mean of the dependent "
         "variable for a level is compared to the mean of the dependent "
         "variable over all previous levels. Hence, the name ‘reverse’ "
         "being sometimes applied to differentiate from forward Helmert "
         "coding."),
    )
    for coding_name, coding_cls, intro in remaining_codings:
        # Unused result, kept for the same reason as the Treatment call.
        coding_cls().code_without_intercept(levels)
        res = ols(formula_head + coding_name + ")", data=df_final).fit()
        print(intro)
        print(res.summary())
        if o is not None:
            # The report uses one more leading newline than the console.
            _append_to_report(o, ["\n" + intro, res.summary().as_text()])