def fit_polynomial_coding(col, values, handle_missing, handle_unknown):
    """Build the polynomial-contrast lookup table for one column.

    Parameters
    ----------
    col : object
        Column identifier; ``str(col)`` is used as the prefix of the
        generated column names (``<col>_0``, ``<col>_1``, ...).
    values : pandas.Series
        Values to encode. Must support boolean-mask indexing and, when
        ``handle_missing == 'return_nan'``, a ``.loc[np.nan]`` lookup.
        NOTE(review): presumably the ordinal codes indexed by original
        category -- confirm against the caller.
    handle_missing : str
        ``'value'`` keeps only strictly positive entries of ``values`` and
        appends an all-zero row labelled ``-2``; ``'return_nan'`` sets the
        row labelled ``values.loc[np.nan]`` to NaN; anything else adds
        nothing.
    handle_unknown : str
        ``'indicator'`` adds ``-1`` as an extra level before the contrast
        is computed; ``'return_nan'`` adds a NaN row labelled ``-1``;
        ``'value'`` adds an all-zero row labelled ``-1``.

    Returns
    -------
    pandas.DataFrame
        One row per encoded value, one column per contrast column. When
        fewer than two values remain, a column-less frame indexed by the
        remaining values is returned.
    """
    if handle_missing == 'value':
        values = values[values > 0]

    # Series.get_values() was deprecated in pandas 0.25 and removed in 1.0;
    # np.asarray yields the same dense ndarray on every pandas version.
    values_to_encode = np.asarray(values)

    # A polynomial contrast needs at least two levels.
    if len(values) < 2:
        return pd.DataFrame(index=values_to_encode)

    if handle_unknown == 'indicator':
        values_to_encode = np.append(values_to_encode, -1)

    polynomial_contrast_matrix = Poly().code_without_intercept(values_to_encode)
    df = pd.DataFrame(
        data=polynomial_contrast_matrix.matrix,
        index=values_to_encode,
        columns=[
            str(col) + '_%d' % (i,)
            for i in range(len(polynomial_contrast_matrix.column_suffixes))
        ],
    )

    # The frame has len(values_to_encode) - 1 columns, hence the width of the
    # zero rows below.
    if handle_unknown == 'return_nan':
        df.loc[-1] = np.nan
    elif handle_unknown == 'value':
        df.loc[-1] = np.zeros(len(values_to_encode) - 1)

    if handle_missing == 'return_nan':
        df.loc[values.loc[np.nan]] = np.nan
    elif handle_missing == 'value':
        df.loc[-2] = np.zeros(len(values_to_encode) - 1)

    return df
def fit_polynomial_coding(values):
    """Return the polynomial contrast table for ``values``.

    With fewer than two values an empty ``DataFrame`` is returned.
    Otherwise the rows of the contrast matrix are labelled starting at 1,
    and an additional all-zero row labelled 0 is appended.
    """
    n_levels = len(values)
    if n_levels < 2:
        return pd.DataFrame()

    contrast = Poly().code_without_intercept(values)
    table = pd.DataFrame(
        data=contrast.matrix,
        columns=contrast.column_suffixes,
    )

    # Shift the row labels up by one so that label 0 is free for the
    # all-zero row added next.
    table.index = table.index + 1
    table.loc[0] = np.zeros(n_levels - 1)
    return table
# Hand check of a contrast coefficient: (1 / k) times the difference between
# the `write` mean selected by [k] and the mean of the `write` means selected
# by [:k - 1]. `grouped` is defined earlier in the script, outside this excerpt.
k = 4
1.0 / k * (grouped.mean()["write"][k] - grouped.mean()["write"][: k - 1].mean())

# Same check for k = 3.
k = 3
1.0 / k * (grouped.mean()["write"][k] - grouped.mean()["write"][: k - 1].mean())

# ### Orthogonal Polynomial Coding

# For `k=4` levels, the coefficients produced by polynomial coding are the
# linear, quadratic, and cubic trends in the categorical variable. The
# categorical variable is assumed to be represented by an underlying,
# equally spaced numeric variable, so this type of encoding is only used for
# ordered categorical variables with equal spacing. In general, the
# polynomial contrast produces polynomials of order `k-1`. Since `race` is
# not an ordered factor variable, `read` is used as the example instead.
# First, create an ordered categorical from `read`.
hsb2["readcat"] = np.asarray(pd.cut(hsb2["read"], bins=3))
hsb2.groupby("readcat").mean()["write"]

from patsy.contrasts import Poly

levels = hsb2["readcat"].unique().tolist()
contrast = Poly().code_without_intercept(levels)
print(contrast.matrix)

mod = ols(formula="write ~ C(readcat, Poly)", data=hsb2)
res = mod.fit()
print(res.summary())

# The source tutorial reports that readcat has a significant linear effect on
# the dependent variable `write` but no significant higher-order effect.
# NOTE(review): the original text mentions a cubic effect, but with `bins=3`
# the contrast has at most linear and quadratic columns -- confirm whether
# `bins=4` was intended.