def test_contrast():
    from patsy.contrasts import ContrastMatrix, Sum

    values = ["a1", "a3", "a1", "a2"]

    # No intercept in model, full-rank coding of 'a'
    m = make_matrix({"a": C(values)}, 3, [["a"]],
                    column_names=["a[a1]", "a[a2]", "a[a3]"])
    assert np.allclose(m, [[1, 0, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]])

    for s in (Sum, Sum()):
        m = make_matrix({"a": C(values, s)}, 3, [["a"]],
                        column_names=["a[mean]", "a[S.a1]", "a[S.a2]"])
        # Output from R
        assert np.allclose(m, [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]])

    m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [["a"]],
                    column_names=["a[mean]", "a[S.a2]", "a[S.a3]"])
    # Output from R
    assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]])

    # Intercept in model, non-full-rank coding of 'a'
    m = make_matrix({"a": C(values)}, 3, [[], ["a"]],
                    column_names=["Intercept", "a[T.a2]", "a[T.a3]"])
    assert np.allclose(m, [[1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0]])

    for s in (Sum, Sum()):
        m = make_matrix({"a": C(values, s)}, 3, [[], ["a"]],
                        column_names=["Intercept", "a[S.a1]", "a[S.a2]"])
        # Output from R
        assert np.allclose(m, [[1, 1, 0], [1, -1, -1], [1, 1, 0], [1, 0, 1]])

    m = make_matrix({"a": C(values, Sum(omit=0))}, 3, [[], ["a"]],
                    column_names=["Intercept", "a[S.a2]", "a[S.a3]"])
    # Output from R
    assert np.allclose(m, [[1, -1, -1], [1, 0, 1], [1, -1, -1], [1, 1, 0]])

    # Weird ad hoc less-than-full-rank coding of 'a'
    m = make_matrix({"a": C(values, [[7, 12], [2, 13], [8, -1]])}, 2, [["a"]],
                    column_names=["a[custom0]", "a[custom1]"])
    assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]])

    m = make_matrix(
        {"a": C(values,
                ContrastMatrix([[7, 12], [2, 13], [8, -1]],
                               ["[foo]", "[bar]"]))},
        2, [["a"]], column_names=["a[foo]", "a[bar]"])
    assert np.allclose(m, [[7, 12], [8, -1], [7, 12], [2, 13]])
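# For reference, a standalone look at the matrix the test above expects
# (the test itself depends on patsy-internal helpers such as `make_matrix`
# and `C`, so it is not runnable on its own; only patsy's public contrast
# API is used here):
from patsy.contrasts import Sum

levels = ["a1", "a2", "a3"]
contrast = Sum().code_without_intercept(levels)
print(contrast.column_suffixes)  # ['[S.a1]', '[S.a2]']
print(contrast.matrix)
# [[ 1.  0.]
#  [ 0.  1.]
#  [-1. -1.]]
# the last level is coded as all -1s, matching R's contr.sum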
import numpy as np
import pandas as pd
from patsy.contrasts import Sum


def fit_sum_coding(col, values, handle_missing, handle_unknown):
    if handle_missing == 'value':
        values = values[values > 0]

    # pd.Series.get_values() was removed in pandas 1.0; .to_numpy() is the
    # modern equivalent
    values_to_encode = values.to_numpy()

    if len(values) < 2:
        return pd.DataFrame(index=values_to_encode)

    if handle_unknown == 'indicator':
        values_to_encode = np.append(values_to_encode, -1)

    sum_contrast_matrix = Sum().code_without_intercept(
        values_to_encode.tolist())
    df = pd.DataFrame(
        data=sum_contrast_matrix.matrix,
        index=values_to_encode,
        columns=[
            str(col) + '_%d' % (i, )
            for i in range(len(sum_contrast_matrix.column_suffixes))
        ])

    if handle_unknown == 'return_nan':
        df.loc[-1] = np.nan
    elif handle_unknown == 'value':
        df.loc[-1] = np.zeros(len(values_to_encode) - 1)

    if handle_missing == 'return_nan':
        # look up the ordinal code that NaN was mapped to
        df.loc[values.loc[np.nan]] = np.nan
    elif handle_missing == 'value':
        df.loc[-2] = np.zeros(len(values_to_encode) - 1)

    return df
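# Hypothetical usage sketch. In category_encoders, `values` is the ordinal
# mapping for one column: a Series of positive integer codes indexed by
# category, with NaN mapped to a negative sentinel. The names below are
# illustrative only.
mapping = pd.Series([1, 2, 3, -2], index=['a', 'b', 'c', np.nan])
table = fit_sum_coding('color', mapping, handle_missing='value',
                       handle_unknown='value')
print(table)
# one row per ordinal code (columns 'color_0', 'color_1'), plus all-zero
# rows at -1 (unknown) and -2 (missing)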
import numpy as np
from patsy.contrasts import Sum


def MakeLinearSinus(t, k=(1, 2, 3), trend=False, YAM=False):
    """Build a harmonic regression design matrix for daily data with a
    365-day period: an intercept, optionally a linear trend or sum-coded
    yearly offsets, and sine/cosine terms at the harmonics in `k`."""
    N = len(t)
    freq = 365
    K = np.repeat(np.array([k]), N, axis=0)
    fix = 2 * np.pi / freq
    Fix = t.reshape(N, 1) * fix * K
    intercept = np.ones((N, 1))
    if trend:
        Xm = np.concatenate(
            [intercept, t.reshape(N, 1), np.sin(Fix), np.cos(Fix)], axis=1)
    elif YAM:
        # Sum-code the year index so the yearly offsets sum to zero.
        # Note: indexing the contrast matrix with `year` assumes t starts
        # at day 0, so the year indices run 0..n_years-1. sorted() makes
        # the level order deterministic (set order is not guaranteed).
        year = (t / 365).astype("int")
        year = Sum().code_without_intercept(sorted(set(year))).matrix[year, :]
        Xm = np.concatenate(
            [intercept, year, np.sin(Fix), np.cos(Fix)], axis=1)
    else:
        Xm = np.concatenate([intercept, np.sin(Fix), np.cos(Fix)], axis=1)
    return Xm
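# Hypothetical usage: three years of daily samples with yearly sum-coded
# offsets and harmonics 1 and 2. Purely illustrative values.
t = np.arange(3 * 365)
X = MakeLinearSinus(t, k=(1, 2), YAM=True)
print(X.shape)
# (1095, 7): intercept + 2 year-contrast columns + sin/cos at 2 harmonics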
import numpy as np
import pandas as pd
from patsy.contrasts import Sum


def fit_sum_coding(values):
    """Build a sum-coding lookup table for a list of observed levels, with
    an all-zeros row at index 0 for unseen values."""
    if len(values) < 2:
        return pd.DataFrame()
    sum_contrast_matrix = Sum().code_without_intercept(values)
    df = pd.DataFrame(data=sum_contrast_matrix.matrix,
                      columns=sum_contrast_matrix.column_suffixes)
    # Shift the index so row i corresponds to ordinal code i (1-based),
    # then add a zero row for code 0 (unknown/unseen values).
    df.index += 1
    df.loc[0] = np.zeros(len(values) - 1)
    return df
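# Sketch of the resulting lookup table for three observed levels:
table = fit_sum_coding(['low', 'mid', 'high'])
print(table)
#    [S.low]  [S.mid]
# 1      1.0      0.0
# 2      0.0      1.0
# 3     -1.0     -1.0
# 0      0.0      0.0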
import pandas as pd


def remove_batch_effect(X, batches, coefs=None):
    """Python version of limma::removeBatchEffect.

    This should duplicate the original R code here (for the case where
    there is only a single vector of batches):
    https://rdrr.io/bioc/limma/src/R/removeBatchEffect.R

    For now, batches needs to be integer indexes. If coefs are provided,
    they should be a p x m array, where p is the number of features in the
    original dataset and m is the number of columns in the batch design
    matrix (matching the shape sklearn's LinearRegression returns below).
    """
    from patsy.contrasts import Sum

    # use sum coding to code batches, this is what limma does
    # https://www.statsmodels.org/dev/examples/notebooks/generated/contrasts.html#Sum-(Deviation)-Coding
    # this is a bit easier/more intuitive in R, due to its built-in factor
    # type, but we can sort of emulate it here with pandas categorical data
    batches_df = pd.Series(batches, dtype='category')
    contrast = Sum().code_without_intercept(list(batches_df.cat.categories))
    design = contrast.matrix[batches.astype(int), :]

    # if coefficients are provided, just use them to correct the provided
    # data; otherwise fit the model and then correct the provided data
    if coefs is None:
        from sklearn.linear_model import LinearRegression
        # X is an n x p matrix, design is an n x m matrix of sum-coded
        # batch indicators; we want a p x m matrix of coefficients
        reg = LinearRegression().fit(design, X)
        # per sklearn documentation, for multiple targets the coef_ is
        # always an (n_targets, n_features) array (i.e. p x m)
        assert reg.coef_.shape == (X.shape[1], design.shape[1])
        coefs = reg.coef_

    return X - (design.astype(float) @ coefs.T), coefs
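# Hypothetical usage sketch (names and numbers are illustrative): remove a
# constant offset injected into the second of two batches.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(6, 4))           # 6 samples x 4 features
batches = np.array([0, 0, 0, 1, 1, 1])
X[3:] += 2.0                          # inject a batch effect into batch 1

X_corrected, coefs = remove_batch_effect(X, batches)
# with sum coding the grand mean is preserved, so the per-batch means of
# each feature should now agree (difference ~ 0):
print(X_corrected[:3].mean(axis=0) - X_corrected[3:].mean(axis=0))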
print(contrast.matrix)

mod = ols("write ~ C(race, Simple)", data=hsb2)
res = mod.fit()
print(res.summary())

# ### Sum (Deviation) Coding

# Sum coding compares the mean of the dependent variable for a given level
# to the overall mean of the dependent variable over all the levels. That
# is, it uses contrasts between each of the first k-1 levels and level k.
# In this example, level 1 is compared to all the others, level 2 to all
# the others, and level 3 to all the others.

from patsy.contrasts import Sum

contrast = Sum().code_without_intercept(levels)
print(contrast.matrix)

mod = ols("write ~ C(race, Sum)", data=hsb2)
res = mod.fit()
print(res.summary())

# This corresponds to a parameterization that forces all the coefficients
# to sum to zero. Notice that the intercept here is the grand mean, where
# the grand mean is the mean of the means of the dependent variable by
# each level.

hsb2.groupby('race')['write'].mean().mean()

# ### Backward Difference Coding

# In backward difference coding, the mean of the dependent variable for a
# level is compared with the mean of the dependent variable for the prior
# level.
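# For concreteness, this is the backward-difference matrix patsy's Diff
# produces for the four levels used in this example (a quick sketch; only
# patsy's public API is assumed):
from patsy.contrasts import Diff

contrast = Diff().code_without_intercept(levels)
print(contrast.matrix)
# [[-0.75 -0.5  -0.25]
#  [ 0.25 -0.5  -0.25]
#  [ 0.25  0.5  -0.25]
#  [ 0.25  0.5   0.75]]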
import numpy as np
from patsy.contrasts import ContrastMatrix, Treatment, Sum, Diff, Helmert
from statsmodels.formula.api import ols

# Relies on module-level globals set elsewhere in this script: c (the
# contrast variable spec), o (output file path or None), levels,
# full_model, full_model_variable_list, dep_var, and df_final.


def contrasting():
    global c
    if c:
        # to account for multiple contrast variables
        contrastvars = []
        if "," in c:
            contrastvars = c.split(",")
            for i in range(len(contrastvars)):
                contrastvars[i] = contrastvars[i].strip()
                if " " in contrastvars[i]:
                    contrastvars[i] = contrastvars[i].replace(" ", "_")
                if "/" in contrastvars[i]:  # to account for URLs
                    splitted = contrastvars[i].split("/")
                    contrastvars[i] = splitted[-1]
        else:
            splitted = c.split("/")  # to account for URLs
            c = splitted[-1]

        # Build the right-hand side of the formula from every model
        # variable that is not itself a contrast variable.
        ind_vars_no_contrast_var = ''
        index = 1
        for i in range(len(full_model_variable_list)):
            if "/" in full_model_variable_list[i]:
                splitted = full_model_variable_list[i].split("/")
                full_model_variable_list[i] = splitted[-1]
            if " " in full_model_variable_list[i]:
                full_model_variable_list[i] = \
                    full_model_variable_list[i].replace(" ", "_")
        for var in full_model_variable_list:
            if var != c and var not in contrastvars:
                if index == 1:
                    ind_vars_no_contrast_var = var
                    index += 1
                else:
                    ind_vars_no_contrast_var = \
                        ind_vars_no_contrast_var + " + " + var
        if len(contrastvars) > 0:
            contraststring = ' + '.join(contrastvars)
        else:
            if " " in c:
                c = c.replace(" ", "_")
            contraststring = c

        # With contrast (treatment coding)
        print(
            "\n\nTreatment (Dummy) Coding: Dummy coding compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept."
        )
        # computed for reference; C(...) in the formula applies the coding
        ctrst = Treatment(reference=0).code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Treatment)",
                  data=df_final)
        res = mod.fit()
        print("With contrast (treatment coding)")
        print(res.summary())
        if o is not None:  # append this block to the output file
            f = open(o, "a")
            f.write("\n" + full_model)
            f.write(
                "\n\n***********************************************************************************************************"
            )
            f.write(
                "\n\n\n\nTreatment (Dummy) Coding: Dummy coding compares each level of the categorical variable to a base reference level. The base reference level is the value of the intercept."
            )
            f.write("With contrast (treatment coding)")
            f.write(res.summary().as_text())
            f.close()

        # Defining the Simple class (patsy has no built-in Simple contrast)
        def _name_levels(prefix, levels):
            return ["[%s%s]" % (prefix, level) for level in levels]

        class Simple(object):
            def _simple_contrast(self, levels):
                nlevels = len(levels)
                contr = -1. / nlevels * np.ones((nlevels, nlevels - 1))
                contr[1:][np.diag_indices(nlevels - 1)] = \
                    (nlevels - 1.) / nlevels
                return contr

            def code_with_intercept(self, levels):
                c = np.column_stack(
                    (np.ones(len(levels)), self._simple_contrast(levels)))
                return ContrastMatrix(c, _name_levels("Simp.", levels))

            def code_without_intercept(self, levels):
                c = self._simple_contrast(levels)
                return ContrastMatrix(c, _name_levels("Simp.", levels[:-1]))

        ctrst = Simple().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Simple)",
                  data=df_final)
        res = mod.fit()
        print(
            "\n\nSimple Coding: Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors."
        )
        print(res.summary())
        if o is not None:  # append this block to the output file
            f = open(o, "a")
            f.write(
                "\n\n\nSimple Coding: Like Treatment Coding, Simple Coding compares each level to a fixed reference level. However, with simple coding, the intercept is the grand mean of all the levels of the factors."
            )
            f.write(res.summary().as_text())
            f.close()

        # With contrast (sum/deviation coding)
        ctrst = Sum().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Sum)",
                  data=df_final)
        res = mod.fit()
        print(
            "\n\nSum (Deviation) Coding: Sum coding compares the mean of the dependent variable for a given level to the overall mean of the dependent variable over all the levels."
        )
        print(res.summary())
        if o is not None:  # append this block to the output file
            f = open(o, "a")
            f.write(
                "\n\n\nSum (Deviation) Coding: Sum coding compares the mean of the dependent variable for a given level to the overall mean of the dependent variable over all the levels."
            )
            f.write(res.summary().as_text())
            f.close()

        # With contrast (backward difference coding)
        ctrst = Diff().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Diff)",
                  data=df_final)
        res = mod.fit()
        print(
            "\n\nBackward Difference Coding: In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level."
        )
        print(res.summary())
        if o is not None:  # append this block to the output file
            f = open(o, "a")
            f.write(
                "\n\n\nBackward Difference Coding: In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level."
            )
            f.write(res.summary().as_text())
            f.close()

        # With contrast (Helmert coding)
        ctrst = Helmert().code_without_intercept(levels)
        mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
                  contraststring + ", Helmert)",
                  data=df_final)
        res = mod.fit()
        print(
            "\n\nHelmert Coding: Our version of Helmert coding is sometimes referred to as Reverse Helmert Coding. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels. Hence, the name ‘reverse’ being sometimes applied to differentiate from forward Helmert coding."
        )
        print(res.summary())
        if o is not None:  # append this block to the output file
            f = open(o, "a")
            f.write(
                "\n\n\nHelmert Coding: Our version of Helmert coding is sometimes referred to as Reverse Helmert Coding. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels. Hence, the name ‘reverse’ being sometimes applied to differentiate from forward Helmert coding."
            )
            f.write(res.summary().as_text())
            f.close()
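# The five model blocks above repeat the same fit/print/write steps with only
# the contrast class and the description changing, so a table-driven loop is
# a natural refactor. A sketch only, assuming it runs in the same scope as
# above (so `dep_var`, `ind_vars_no_contrast_var`, `contraststring`,
# `df_final`, `o`, and the contrast classes, including the locally defined
# `Simple`, are all visible to patsy's formula evaluation):
codings = [
    ("Treatment", "Treatment (Dummy) Coding: compares each level to a base reference level."),
    ("Simple", "Simple Coding: compares each level to a fixed reference level; the intercept is the grand mean."),
    ("Sum", "Sum (Deviation) Coding: compares each level's mean to the overall mean."),
    ("Diff", "Backward Difference Coding: compares each level's mean to the prior level's mean."),
    ("Helmert", "Helmert Coding: compares each level's mean to the mean of all previous levels."),
]
for name, description in codings:
    mod = ols(dep_var + " ~ " + ind_vars_no_contrast_var + " + C(" +
              contraststring + ", " + name + ")", data=df_final)
    res = mod.fit()
    print("\n\n" + description)
    print(res.summary())
    if o is not None:
        with open(o, "a") as f:
            f.write("\n\n\n" + description)
            f.write(res.summary().as_text())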