def _generate_rhs(contrasts, columns_var_gt_0) -> list[Term]:
    rhs = [Term([])]  # force intercept
    for contrast in contrasts:
        if contrast["type"] == "infer":
            if not columns_var_gt_0[contrast["variable"]].all():
                logger.warning(
                    f'Not adding term "{contrast["variable"]}" to design matrix '
                    "because it has zero variance"
                )
                continue
            # for every term in the model a contrast of type infer needs to be specified
            rhs.append(
                Term([LookupFactor(name) for name in contrast["variable"]])
            )

    return rhs
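For context, the termlist returned by _generate_rhs can be handed straight to patsy by wrapping it in a ModelDesc with an empty left-hand side. The following is a minimal sketch, not part of the original code: the DataFrame df, the contrasts list, and the way columns_var_gt_0 is built are illustrative assumptions, and _generate_rhs (with its logger) is taken from the snippet above.

import pandas as pd
from patsy import ModelDesc, Term, LookupFactor, dmatrix

# Hypothetical inputs, only for illustration.
df = pd.DataFrame({"age": [23.0, 31.0, 27.0], "group": ["a", "b", "a"]})
contrasts = [
    {"type": "infer", "variable": ["age"]},
    {"type": "infer", "variable": ["group"]},
]
# True where a column actually varies; indexed by column name so that
# columns_var_gt_0[contrast["variable"]].all() works as in _generate_rhs.
columns_var_gt_0 = df.nunique() > 1

rhs = _generate_rhs(contrasts, columns_var_gt_0)
# An empty LHS plus the generated RHS termlist yields the design matrix.
design = dmatrix(ModelDesc([], rhs), df, return_type="dataframe")
print(design.design_info.column_names)  # e.g. ['Intercept', 'age', 'group[T.b]']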
def make_termlist(*entries):
    terms = []
    for entry in entries:
        terms.append(Term([LookupFactor(name) for name in entry]))
    return terms
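A quick hypothetical call (not part of the test module) showing what make_termlist produces: each argument is an iterable of variable names that becomes one Term, and an empty tuple yields the intercept term.

# Hypothetical usage; Term and LookupFactor come from patsy as above.
termlist = make_termlist((), ("x",), ("x", "y"))
print([term.name() for term in termlist])  # ['Intercept', 'x', 'x:y']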
def test_formula_likes():
    # Plain array-like, rhs only
    t([[1, 2, 3], [4, 5, 6]], {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t(np.asarray([[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    t((None, np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"])
    dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    t(dm, {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])
    t((None, dm), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"])

    # Plain array-likes, lhs and rhs
    t(([1, 2], [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t(([[1], [2]], [[1, 2, 3], [4, 5, 6]]), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([1, 2]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    t((np.asarray([[1], [2]]), np.asarray([[1, 2, 3], [4, 5, 6]])), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["x0", "x1", "x2"],
      [[1], [2]], ["y0"])
    x_dm = DesignMatrix([[1, 2, 3], [4, 5, 6]], default_column_prefix="foo")
    y_dm = DesignMatrix([1, 2], default_column_prefix="bar")
    t((y_dm, x_dm), {}, 0, False,
      [[1, 2, 3], [4, 5, 6]], ["foo0", "foo1", "foo2"],
      [[1], [2]], ["bar0"])
    # number of rows must match
    t_invalid(([1, 2, 3], [[1, 2, 3], [4, 5, 6]]), {}, 0)

    # tuples must have the right size
    t_invalid(([[1, 2, 3]],), {}, 0)
    t_invalid(([[1, 2, 3]], [[1, 2, 3]], [[1, 2, 3]]), {}, 0)

    # plain Series and DataFrames
    if have_pandas:
        # Names are extracted
        t(pandas.DataFrame({"x": [1, 2, 3]}), {}, 0, False,
          [[1], [2], [3]], ["x"])
        t(pandas.Series([1, 2, 3], name="asdf"), {}, 0, False,
          [[1], [2], [3]], ["asdf"])
        t((pandas.DataFrame({"y": [4, 5, 6]}),
           pandas.DataFrame({"x": [1, 2, 3]})),
          {}, 0, False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        t((pandas.Series([4, 5, 6], name="y"),
           pandas.Series([1, 2, 3], name="x")),
          {}, 0, False,
          [[1], [2], [3]], ["x"],
          [[4], [5], [6]], ["y"])
        # Or invented
        t((pandas.DataFrame([[4, 5, 6]]),
           pandas.DataFrame([[1, 2, 3]], columns=[7, 8, 9])),
          {}, 0, False,
          [[1, 2, 3]], ["x7", "x8", "x9"],
          [[4, 5, 6]], ["y0", "y1", "y2"])
        t(pandas.Series([1, 2, 3]), {}, 0, False,
          [[1], [2], [3]], ["x0"])
        # indices must match
        t_invalid((pandas.DataFrame([[1]], index=[1]),
                   pandas.DataFrame([[1]], index=[2])),
                  {}, 0)

    # Foreign ModelDesc factories
    class ForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return ModelDesc([Term([LookupFactor("Y")])],
                             [Term([LookupFactor("X")])])
    foreign_model = ForeignModelSource()
    t(foreign_model,
      {"Y": [1, 2],
       "X": [[1, 2], [3, 4]]},
      0,
      True,
      [[1, 2], [3, 4]], ["X[0]", "X[1]"],
      [[1], [2]], ["Y"])

    class BadForeignModelSource(object):
        def __patsy_get_model_desc__(self, data):
            return data
    t_invalid(BadForeignModelSource(), {}, 0)

    # string formulas
    t("y ~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"],
      [[1], [2]], ["y"])
    t("~ x", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3], [1, 4]], ["Intercept", "x"])
    t("x + y", {"y": [1, 2], "x": [3, 4]}, 0,
      True,
      [[1, 3, 1], [1, 4, 2]], ["Intercept", "x", "y"])

    # unicode objects on py2 (must be ascii only)
    if not six.PY3:
        # ascii is fine
        t(unicode("y ~ x"),
          {"y": [1, 2], "x": [3, 4]}, 0,
          True,
          [[1, 3], [1, 4]], ["Intercept", "x"],
          [[1], [2]], ["y"])
        # non-ascii is not (even if this would be valid on py3 with its less
        # restrictive variable naming rules)
        eacute = "\xc3\xa9".decode("utf-8")
        assert isinstance(eacute, unicode)
        assert_raises(PatsyError, dmatrix, eacute, data={eacute: [1, 2]})

    # ModelDesc
    desc = ModelDesc([], [Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1.5], [2.5], [3.5]], ["x"])
    desc = ModelDesc([], [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"])
    desc = ModelDesc([Term([LookupFactor("y")])],
                     [Term([]), Term([LookupFactor("x")])])
    t(desc, {"x": [1.5, 2.5, 3.5], "y": [10, 20, 30]}, 0,
      True,
      [[1, 1.5], [1, 2.5], [1, 3.5]], ["Intercept", "x"],
      [[10], [20], [30]], ["y"])

    # builders
    termlists = (
        [],
        [Term([LookupFactor("x")])],
        [Term([]), Term([LookupFactor("x")])],
    )
    builders = design_matrix_builders(termlists,
                                      lambda: iter([{"x": [1, 2, 3]}]),
                                      eval_env=0)
    # twople but with no LHS
    t((builders[0], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # single DesignInfo
    t(builders[2], {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"])
    # twople with LHS
    t((builders[1], builders[2]), {"x": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x"],
      [[10], [20], [30]], ["x"])

    # check depth arguments
    x_in_env = [1, 2, 3]
    t("~ x_in_env", {}, 0,
      True,
      [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    t("~ x_in_env", {"x_in_env": [10, 20, 30]}, 0,
      True,
      [[1, 10], [1, 20], [1, 30]], ["Intercept", "x_in_env"])
    # Trying to pull x_in_env out of our *caller* shouldn't work.
    t_invalid("~ x_in_env", {}, 1, exc=(NameError, PatsyError))
    # But then again it should, if called from one down on the stack:
    def check_nested_call():
        x_in_env = "asdf"
        t("~ x_in_env", {}, 1,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call()
    # passing in an explicit EvalEnvironment also works:
    e = EvalEnvironment.capture(1)
    t_invalid("~ x_in_env", {}, e, exc=(NameError, PatsyError))
    e = EvalEnvironment.capture(0)
    def check_nested_call_2():
        x_in_env = "asdf"
        t("~ x_in_env", {}, e,
          True,
          [[1, 1], [1, 2], [1, 3]], ["Intercept", "x_in_env"])
    check_nested_call_2()
def __patsy_get_model_desc__(self, data):
    return ModelDesc([Term([LookupFactor("Y")])],
                     [Term([LookupFactor("X")])])
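For context, any object that defines this hook can be passed to patsy wherever a formula is expected; patsy calls __patsy_get_model_desc__ and builds matrices from the returned ModelDesc. Below is a minimal, self-contained sketch; the class name MyModelSource and the data values are made up for illustration.

from patsy import ModelDesc, Term, LookupFactor, dmatrices

class MyModelSource(object):
    # patsy invokes this hook when the object is used as a formula
    def __patsy_get_model_desc__(self, data):
        return ModelDesc([Term([LookupFactor("Y")])],
                         [Term([LookupFactor("X")])])

y, X = dmatrices(MyModelSource(), {"Y": [1, 2], "X": [3, 4]})
print(y.design_info.column_names)  # ['Y']
print(X.design_info.column_names)  # ['X'] -- no intercept, since Term([]) was not added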
print(data[["Label", "f1", "f2", data.columns[-1]]].head())

###################################################
# Let's train a logistic regression.

formula = "Label ~ {0}".format(" + ".join(data.columns[1:]))
print(formula[:50] + " + ...")

from microsoftml import rx_logistic_regression

try:
    logregml = rx_logistic_regression(formula, data=data)
except Exception as e:
    # The error is expected because patsy cannot handle
    # so many features.
    print(e)

#########################################
# Let's skip patsy's parser and manually define the formula with
# object `ModelDesc <http://patsy.readthedocs.io/en/latest/API-reference.html?highlight=lookupfactor#patsy.ModelDesc>`_.

from patsy.desc import ModelDesc, Term
from patsy.user_util import LookupFactor

patsy_features = [Term([LookupFactor(n)]) for n in data.columns[1:]][:10]
model_formula = ModelDesc([Term([LookupFactor("Label")])],
                          [Term([])] + patsy_features)
print(model_formula.describe() + " + ...")

logregml = rx_logistic_regression(model_formula, data=data)