def test_term_info():
    """Check the column names, term names and terms reported for ``a:b``."""
    design_info = dmatrix("a:b", balanced(a=2, b=2)).design_info

    expected_columns = [
        "Intercept",
        "b[T.b2]",
        "a[T.a2]:b[b1]",
        "a[T.a2]:b[b2]",
    ]
    assert design_info.column_names == expected_columns
    assert design_info.term_names == ["Intercept", "a:b"]

    # Exactly two terms are expected, and the first must equal INTERCEPT.
    assert len(design_info.terms) == 2
    assert design_info.terms[0] == INTERCEPT
def test_categorical():
    """Smoke-test C(), categorical interactions and contrast overrides.

    The more exhaustive tests of the individual coding options live in
    test_build; this only makes sure that C() and related pieces work.
    """
    data = balanced(a=2, b=2)

    def run_cases(cases):
        # ``data`` is looked up at call time, so cases run after the
        # reassignment of data["a"] below see the re-coded factor.
        for formula, expected_values, expected_names in cases:
            t(formula, data, 0, True, expected_values, expected_names)

    run_cases(
        [
            # Plain C() wrappers.
            (
                "~ C(a)",
                [[1, 0], [1, 0], [1, 1], [1, 1]],
                ["Intercept", "C(a)[T.a2]"],
            ),
            (
                "~ C(a, levels=['a2', 'a1'])",
                [[1, 1], [1, 1], [1, 0], [1, 0]],
                ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"],
            ),
            (
                "~ C(a, Treatment(reference=-1))",
                [[1, 1], [1, 1], [1, 0], [1, 0]],
                ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"],
            ),
            # Different interactions.
            (
                "a*b",
                [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]],
                ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"],
            ),
            (
                "0 + a:b",
                [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
                ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"],
            ),
            (
                "1 + a + a:b",
                [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 0, 1]],
                ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"],
            ),
        ]
    )

    # Changing the contrast with C(): store a Helmert-coded factor in data.
    data["a"] = C(data["a"], Helmert)
    run_cases(
        [
            (
                "a",
                [[1, -1], [1, -1], [1, 1], [1, 1]],
                ["Intercept", "a[H.a2]"],
            ),
            (
                "C(a, Treatment)",
                [[1, 0], [1, 0], [1, 1], [1, 1]],
                ["Intercept", "C(a, Treatment)[T.a2]"],
            ),
            # Repeating the first case checks that the C(a, Treatment)
            # formula did not affect the original object.
            (
                "a",
                [[1, -1], [1, -1], [1, 1], [1, 1]],
                ["Intercept", "a[H.a2]"],
            ),
        ]
    )
def test_simple():
    """Build small design matrices with make_matrix and compare their values."""
    data = balanced(a=2, b=2)
    x1 = np.linspace(0, 1, len(data["a"]))
    data["x1"] = x1
    x2 = data["x1"] ** 2
    data["x2"] = x2

    # Each case is (second make_matrix argument, termlist, column names,
    # expected matrix values). Cases run in the listed order.
    cases = [
        (
            2,
            [["a"]],
            ["a[a1]", "a[a2]"],
            [[1, 0], [1, 0], [0, 1], [0, 1]],
        ),
        (
            2,
            [[], ["a"]],
            ["Intercept", "a[T.a2]"],
            [[1, 0], [1, 0], [1, 1], [1, 1]],
        ),
        (
            4,
            [["a", "b"]],
            ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"],
            [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
        ),
        (
            4,
            [[], ["a"], ["b"], ["a", "b"]],
            ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"],
            [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]],
        ),
        (
            4,
            [[], ["b"], ["a"], ["b", "a"]],
            ["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"],
            [[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]],
        ),
        (
            4,
            [["a"], ["x1"], ["a", "x1"]],
            ["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"],
            [
                [1, 0, x1[0], 0],
                [1, 0, x1[1], 0],
                [0, 1, x1[2], x1[2]],
                [0, 1, x1[3], x1[3]],
            ],
        ),
        (
            3,
            [["x1"], ["x2"], ["x2", "x1"]],
            ["x1", "x2", "x2:x1"],
            np.column_stack((x1, x2, x1 * x2)),
        ),
    ]

    for size, termlist, names, expected in cases:
        matrix = make_matrix(data, size, termlist, column_names=names)
        assert np.allclose(matrix, expected)
def test_redundancy_thoroughly():
    """Check the matrix rank for every combination of terms.

    To make sure there are no lurking bugs analogous to the ones R has, the
    expected rank is passed to make_matrix for every possible combination of
    2 categorical and 2 numerical factors.
    """
    import time

    data = balanced(a=2, b=2, repeat=5)
    data["x1"] = np.linspace(0, 1, len(data["a"]))
    data["x2"] = data["x1"] ** 2

    def all_subsets(items):
        # Recursively yield every subset of ``items`` as a sorted tuple.
        if items:
            head = items[0]
            for tail_subset in all_subsets(items[1:]):
                yield tuple(sorted(tail_subset))
                yield tuple(sorted((head,) + tail_subset))
        else:
            yield tuple()

    all_terms = list(all_subsets(("a", "b", "x1", "x2")))
    all_termlist_templates = list(all_subsets(all_terms))
    print(len(all_termlist_templates))

    # Eliminate some of the symmetric versions to speed things up: a
    # template is skipped when it holds the first term of a pair without
    # also holding the second.
    redundant = [
        [("b",), ("a",)],
        [("x2",), ("x1",)],
        [("b", "x2"), ("a", "x1")],
        [("a", "b", "x2"), ("a", "b", "x1")],
        [("b", "x1", "x2"), ("a", "x1", "x2")],
    ]

    count = 0
    start = time.time()
    for termlist_template in all_termlist_templates:
        present = set(termlist_template)
        if any(
            dispreferred in present and preferred not in present
            for dispreferred, preferred in redundant
        ):
            continue

        expanded_terms = set()
        for term_template in termlist_template:
            numeric = tuple(f for f in term_template if f.startswith("x"))
            categorical = [f for f in term_template if not f.startswith("x")]
            expanded_terms.update(
                frozenset(chosen + numeric) for chosen in all_subsets(categorical)
            )

        # Because our categorical variables have 2 levels, each expanded
        # term corresponds to 1 unique dimension of variation.
        expected_rank = len(expanded_terms)

        if termlist_template in [(), ((),)]:
            # No data dependence, should fail.
            assert_raises(
                PatsyError, make_matrix, data, expected_rank, termlist_template
            )
        else:
            make_matrix(data, expected_rank, termlist_template)

        count += 1
        if count % 100 == 0:
            print("Completed:", count)

    elapsed = time.time() - start
    print("Took %0.2f seconds" % (elapsed,))
def test_redundancy_thoroughly():
    """Exhaustively check matrix ranks over all term combinations.

    Guards against bugs analogous to the ones R has by passing the expected
    rank to make_matrix for every possible combination of 2 categorical and
    2 numerical factors.
    """
    data = balanced(a=2, b=2, repeat=5)
    data["x1"] = np.linspace(0, 1, len(data["a"]))
    data["x2"] = data["x1"] ** 2

    def all_subsets(seq):
        # Generator over all subsets of ``seq``; each one is a sorted tuple.
        if not seq:
            yield tuple()
            return
        first = seq[0]
        for smaller in all_subsets(seq[1:]):
            yield tuple(sorted(smaller))
            yield tuple(sorted((first,) + smaller))

    def expected_rank_for(termlist):
        # Expand each term into every subset of its categorical factors
        # combined with all of its numeric ("x"-prefixed) factors. Because
        # our categorical variables have 2 levels, each expanded term
        # corresponds to 1 unique dimension of variation.
        expanded = set()
        for term in termlist:
            numeric = tuple(name for name in term if name.startswith("x"))
            rest = [name for name in term if not name.startswith("x")]
            for subset_rest in all_subsets(rest):
                expanded.add(frozenset(subset_rest + numeric))
        return len(expanded)

    all_terms = list(all_subsets(("a", "b", "x1", "x2")))
    all_termlist_templates = list(all_subsets(all_terms))
    print(len(all_termlist_templates))

    # Eliminate some of the symmetric versions to speed things up.
    redundant = [
        [("b",), ("a",)],
        [("x2",), ("x1",)],
        [("b", "x2"), ("a", "x1")],
        [("a", "b", "x2"), ("a", "b", "x1")],
        [("b", "x1", "x2"), ("a", "x1", "x2")],
    ]

    count = 0
    import time

    start = time.time()
    for termlist_template in all_termlist_templates:
        termlist_set = set(termlist_template)
        for dispreferred, preferred in redundant:
            has_dispreferred = dispreferred in termlist_set
            if has_dispreferred and preferred not in termlist_set:
                # Symmetric duplicate of another template: skip it.
                break
        else:
            expected_rank = expected_rank_for(termlist_template)
            if termlist_template == () or termlist_template == ((),):
                # No data dependence, should fail.
                assert_raises(
                    PatsyError, make_matrix, data, expected_rank, termlist_template
                )
            else:
                make_matrix(data, expected_rank, termlist_template)
            count += 1
            if count % 100 == 0:
                print("Completed:", count)
    print("Took %0.2f seconds" % (time.time() - start,))
def test_categorical():
    """Make sure C(), categorical interactions and contrast changes work.

    More exhaustive tests for all the different coding options are in
    test_build; this is only a sanity check.
    """
    data = balanced(a=2, b=2)

    def check(formula, values, names):
        # Forward to ``t`` with the fixed ``0, True`` arguments every case uses.
        t(formula, data, 0, True, values, names)

    # --- C() with default, explicit levels, and explicit reference ---
    check(
        "~ C(a)",
        [[1, 0], [1, 0], [1, 1], [1, 1]],
        ["Intercept", "C(a)[T.a2]"],
    )
    check(
        "~ C(a, levels=['a2', 'a1'])",
        [[1, 1], [1, 1], [1, 0], [1, 0]],
        ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"],
    )
    check(
        "~ C(a, Treatment(reference=-1))",
        [[1, 1], [1, 1], [1, 0], [1, 0]],
        ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"],
    )

    # --- Different interactions ---
    check(
        "a*b",
        [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]],
        ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"],
    )
    check(
        "0 + a:b",
        [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
        ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"],
    )
    check(
        "1 + a + a:b",
        [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 0, 1]],
        ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"],
    )

    # --- Changing contrast with C() ---
    helmert_a = C(data["a"], Helmert)
    data["a"] = helmert_a
    check(
        "a",
        [[1, -1], [1, -1], [1, 1], [1, 1]],
        ["Intercept", "a[H.a2]"],
    )
    check(
        "C(a, Treatment)",
        [[1, 0], [1, 0], [1, 1], [1, 1]],
        ["Intercept", "C(a, Treatment)[T.a2]"],
    )
    # That didn't affect the original object: "a" alone gives the same
    # Helmert-coded result as before.
    check(
        "a",
        [[1, -1], [1, -1], [1, 1], [1, 1]],
        ["Intercept", "a[H.a2]"],
    )
def test_simple():
    """Check make_matrix output for a handful of small termlists."""
    data = balanced(a=2, b=2)
    x1 = np.linspace(0, 1, len(data["a"]))
    data["x1"] = x1
    x2 = data["x1"] ** 2
    data["x2"] = x2

    def build(size, termlist, names):
        # Thin wrapper so each case below reads as "build, then compare".
        return make_matrix(data, size, termlist, column_names=names)

    # Single categorical factor, no intercept.
    built = build(2, [["a"]], ["a[a1]", "a[a2]"])
    assert np.allclose(built, [[1, 0], [1, 0], [0, 1], [0, 1]])

    # Intercept plus one categorical factor.
    built = build(2, [[], ["a"]], ["Intercept", "a[T.a2]"])
    assert np.allclose(built, [[1, 0], [1, 0], [1, 1], [1, 1]])

    # A lone two-way categorical interaction.
    built = build(
        4,
        [["a", "b"]],
        ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"],
    )
    expected = [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
    assert np.allclose(built, expected)

    # Intercept, both factors and their interaction, in a-then-b order.
    built = build(
        4,
        [[], ["a"], ["b"], ["a", "b"]],
        ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"],
    )
    expected = [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]]
    assert np.allclose(built, expected)

    # The same terms in b-then-a order.
    built = build(
        4,
        [[], ["b"], ["a"], ["b", "a"]],
        ["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"],
    )
    expected = [[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]
    assert np.allclose(built, expected)

    # Categorical factor, numeric factor, and their interaction.
    built = build(
        4,
        [["a"], ["x1"], ["a", "x1"]],
        ["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"],
    )
    expected = [
        [1, 0, x1[0], 0],
        [1, 0, x1[1], 0],
        [0, 1, x1[2], x1[2]],
        [0, 1, x1[3], x1[3]],
    ]
    assert np.allclose(built, expected)

    # Purely numeric terms.
    built = build(3, [["x1"], ["x2"], ["x2", "x1"]], ["x1", "x2", "x2:x1"])
    assert np.allclose(built, np.column_stack((x1, x2, x1 * x2)))
def test_R_bugs():
    """Run make_matrix on termlists for which R gets the column count wrong."""
    data = balanced(a=2, b=2, c=2)
    n_rows = len(data["a"])
    data["x"] = np.linspace(0, 1, n_rows)

    # "1 + a:b": R cannot tell that the two terms are redundant and so
    # produces a design matrix with too many columns (5 instead of 4).
    intercept_plus_ab = [[], ["a", "b"]]
    make_matrix(data, 4, intercept_plus_ab)

    # "0 + a:x + a:b": R thinks there is a redundancy which does not exist
    # and so produces a design matrix with too few columns (4 instead of 6).
    ax_plus_ab = [["a", "x"], ["a", "b"]]
    make_matrix(data, 6, ax_plus_ab)

    # Compare with "0 + a:c + a:b", where the redundancy does exist.
    # Confusingly, adding another categorical factor raises the baseline
    # dimensionality to 8, and the redundancy then reduces it to 6 again, so
    # the result matches the previous case but for different reasons. (R
    # does get this one right, but we might as well test it.)
    ac_plus_ab = [["a", "c"], ["a", "b"]]
    make_matrix(data, 6, ac_plus_ab)
def test_term_order():
    """Check the order of term_names that dmatrix reports for several formulas."""
    data = balanced(a=2, b=2)
    data["x1"] = np.linspace(0, 1, 4)
    data["x2"] = data["x1"] ** 2

    # (formula, expected design_info.term_names) pairs, checked in order.
    expectations = [
        ("a + b + x1 + x2", ["Intercept", "a", "b", "x1", "x2"]),
        ("b + a + x2 + x1", ["Intercept", "b", "a", "x2", "x1"]),
        ("0 + x1 + a + x2 + b + 1", ["Intercept", "a", "b", "x1", "x2"]),
        ("0 + a:b + a + b + 1", ["Intercept", "a", "b", "a:b"]),
        (
            "a + a:x1 + x2 + x1 + b",
            ["Intercept", "a", "b", "x1", "a:x1", "x2"],
        ),
        (
            "0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b",
            ["a", "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b", "x2", "x1", "a:x1"],
        ),
    ]

    for formula, order in expectations:
        term_names = dmatrix(formula, data).design_info.term_names
        assert term_names == order
def test_term_order():
    """Verify the term ordering dmatrix produces for a set of formulas."""
    data = balanced(a=2, b=2)
    x1 = np.linspace(0, 1, 4)
    data["x1"] = x1
    data["x2"] = data["x1"] ** 2

    def assert_term_names(case):
        # ``case`` is a (formula, expected term_names) pair.
        formula, expected_names = case
        design_info = dmatrix(formula, data).design_info
        assert design_info.term_names == expected_names

    assert_term_names(
        ("a + b + x1 + x2", ["Intercept", "a", "b", "x1", "x2"])
    )
    assert_term_names(
        ("b + a + x2 + x1", ["Intercept", "b", "a", "x2", "x1"])
    )
    assert_term_names(
        ("0 + x1 + a + x2 + b + 1", ["Intercept", "a", "b", "x1", "x2"])
    )
    assert_term_names(
        ("0 + a:b + a + b + 1", ["Intercept", "a", "b", "a:b"])
    )
    assert_term_names(
        (
            "a + a:x1 + x2 + x1 + b",
            ["Intercept", "a", "b", "x1", "a:x1", "x2"],
        )
    )
    assert_term_names(
        (
            "0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b",
            ["a", "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b", "x2", "x1", "a:x1"],
        )
    )