def fit_polynomial_coding(col, values, handle_missing, handle_unknown):
        if handle_missing == 'value':
            values = values[values > 0]

        values_to_encode = values.get_values()

        if len(values) < 2:
            return pd.DataFrame(index=values_to_encode)

        if handle_unknown == 'indicator':
            values_to_encode = np.append(values_to_encode, -1)

        polynomial_contrast_matrix = Poly().code_without_intercept(
            values_to_encode)
        df = pd.DataFrame(
            data=polynomial_contrast_matrix.matrix,
            index=values_to_encode,
            columns=[
                str(col) + '_%d' % (i, )
                for i in range(len(polynomial_contrast_matrix.column_suffixes))
            ])

        if handle_unknown == 'return_nan':
            df.loc[-1] = np.nan
        elif handle_unknown == 'value':
            df.loc[-1] = np.zeros(len(values_to_encode) - 1)

        if handle_missing == 'return_nan':
            df.loc[values.loc[np.nan]] = np.nan
        elif handle_missing == 'value':
            df.loc[-2] = np.zeros(len(values_to_encode) - 1)

        return df
Example #2
0
    def fit_polynomial_coding(values):
        if len(values) < 2:
            return pd.DataFrame()

        polynomial_contrast_matrix = Poly().code_without_intercept(values)
        df = pd.DataFrame(data=polynomial_contrast_matrix.matrix,
                          columns=polynomial_contrast_matrix.column_suffixes)
        df.index += 1
        df.loc[0] = np.zeros(len(values) - 1)
        return df
Example #3
0
k = 4
1. / k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k - 1].mean())
k = 3
1. / k * (grouped.mean()["write"][k] - grouped.mean()["write"][:k - 1].mean())

# ### Orthogonal Polynomial Coding

# The coefficients taken on by polynomial coding for `k=4` levels are the
# linear, quadratic, and cubic trends in the categorical variable. The
# categorical variable here is assumed to be represented by an underlying,
# equally spaced numeric variable. Therefore, this type of encoding is used
# only for ordered categorical variables with equal spacing. In general, the
# polynomial contrast produces polynomials of order `k-1`. Since `race` is
# not an ordered factor variable let's use `read` as an example. First we
# need to create an ordered categorical from `read`.

hsb2['readcat'] = np.asarray(pd.cut(hsb2.read, bins=3))
hsb2.groupby('readcat').mean()['write']

from patsy.contrasts import Poly
levels = hsb2.readcat.unique().tolist()
contrast = Poly().code_without_intercept(levels)
print(contrast.matrix)

mod = ols("write ~ C(readcat, Poly)", data=hsb2)
res = mod.fit()
print(res.summary())

# As you can see, readcat has a significant linear effect on the dependent
# variable `write` but not a significant quadratic or cubic effect.