コード例 #1
0
ファイル: test_highlevel.py プロジェクト: gyenney/Tools
def test_term_info():
    """Check the design_info metadata of the matrix built from ``"a:b"``."""
    info = dmatrix("a:b", balanced(a=2, b=2)).design_info

    expected_columns = [
        "Intercept",
        "b[T.b2]",
        "a[T.a2]:b[b1]",
        "a[T.a2]:b[b2]",
    ]
    expected_terms = ["Intercept", "a:b"]

    assert info.column_names == expected_columns
    assert info.term_names == expected_terms
    # Two terms in total, the first of which is the intercept.
    assert len(info.terms) == 2
    assert info.terms[0] == INTERCEPT
コード例 #2
0
def test_categorical():
    """Smoke-test categorical formulas through the ``t`` helper.

    Covers explicit ``C()`` wrappers, a few interaction layouts, and a
    contrast attached to the data itself. Every check passes ``0`` and
    ``True`` as the third and fourth arguments of ``t``.
    """
    data = balanced(a=2, b=2)

    # The coding options are tested exhaustively in test_build; this only
    # confirms that C() and friends work end to end.
    wrapper_cases = [
        ("~ C(a)",
         [[1, 0], [1, 0], [1, 1], [1, 1]],
         ["Intercept", "C(a)[T.a2]"]),
        ("~ C(a, levels=['a2', 'a1'])",
         [[1, 1], [1, 1], [1, 0], [1, 0]],
         ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"]),
        ("~ C(a, Treatment(reference=-1))",
         [[1, 1], [1, 1], [1, 0], [1, 0]],
         ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"]),
    ]

    # Several ways of combining the two factors.
    interaction_cases = [
        ("a*b",
         [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]],
         ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"]),
        ("0 + a:b",
         [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
         ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"]),
        ("1 + a + a:b",
         [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 0, 1]],
         ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"]),
    ]

    # NOTE: t() is called straight from this function body (not from a
    # nested helper), exactly as the separate calls were before.
    for formula, rows, names in wrapper_cases + interaction_cases:
        t(formula, data, 0, True, rows, names)

    # Attach a Helmert contrast to the data, then override it with C().
    data["a"] = C(data["a"], Helmert)
    contrast_cases = [
        ("a",
         [[1, -1], [1, -1], [1, 1], [1, 1]],
         ["Intercept", "a[H.a2]"]),
        ("C(a, Treatment)",
         [[1, 0], [1, 0], [1, 1], [1, 1]],
         ["Intercept", "C(a, Treatment)[T.a2]"]),
        # Repeating the first check shows the override above did not
        # affect the original object.
        ("a",
         [[1, -1], [1, -1], [1, 1], [1, 1]],
         ["Intercept", "a[H.a2]"]),
    ]
    for formula, rows, names in contrast_cases:
        t(formula, data, 0, True, rows, names)
コード例 #3
0
ファイル: test_build.py プロジェクト: pydata/patsy
def test_simple():
    """Compare ``make_matrix`` output against hand-written matrices."""
    data = balanced(a=2, b=2)
    x1 = np.linspace(0, 1, len(data["a"]))
    data["x1"] = x1
    x2 = data["x1"] ** 2
    data["x2"] = x2

    # Each case: (second positional argument of make_matrix, term list,
    # column names, expected matrix values).
    cases = [
        (
            2,
            [["a"]],
            ["a[a1]", "a[a2]"],
            [[1, 0], [1, 0], [0, 1], [0, 1]],
        ),
        (
            2,
            [[], ["a"]],
            ["Intercept", "a[T.a2]"],
            [[1, 0], [1, 0], [1, 1], [1, 1]],
        ),
        (
            4,
            [["a", "b"]],
            ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"],
            [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
        ),
        (
            4,
            [[], ["a"], ["b"], ["a", "b"]],
            ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"],
            [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]],
        ),
        (
            4,
            [[], ["b"], ["a"], ["b", "a"]],
            ["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"],
            [[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]],
        ),
        (
            4,
            [["a"], ["x1"], ["a", "x1"]],
            ["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"],
            [
                [1, 0, x1[0], 0],
                [1, 0, x1[1], 0],
                [0, 1, x1[2], x1[2]],
                [0, 1, x1[3], x1[3]],
            ],
        ),
        (
            3,
            [["x1"], ["x2"], ["x2", "x1"]],
            ["x1", "x2", "x2:x1"],
            np.column_stack((x1, x2, x1 * x2)),
        ),
    ]

    for rank, terms, names, expected in cases:
        built = make_matrix(data, rank, terms, column_names=names)
        assert np.allclose(built, expected)
コード例 #4
0
def test_term_info():
    """Verify column names, term names and terms for the ``"a:b"`` formula."""
    frame = balanced(a=2, b=2)
    design = dmatrix("a:b", frame).design_info

    # (actual, expected) pairs for the two name listings, in check order.
    name_checks = zip(
        (design.column_names, design.term_names),
        (
            ["Intercept", "b[T.b2]", "a[T.a2]:b[b1]", "a[T.a2]:b[b2]"],
            ["Intercept", "a:b"],
        ),
    )
    for actual, wanted in name_checks:
        assert actual == wanted

    terms = design.terms
    assert len(terms) == 2
    assert terms[0] == INTERCEPT
コード例 #5
0
ファイル: test_build.py プロジェクト: pydata/patsy
def test_redundancy_thoroughly():
    """Check the matrix rank for combinations of 2 categorical + 2 numeric factors.

    Guards against lurking bugs analogous to the ones R has: every term-list
    template built from ``a``, ``b``, ``x1`` and ``x2`` (minus some symmetric
    duplicates) is handed to ``make_matrix`` together with its expected rank.
    """
    data = balanced(a=2, b=2, repeat=5)
    data["x1"] = np.linspace(0, 1, len(data["a"]))
    data["x2"] = data["x1"] ** 2

    def all_subsets(items):
        # Yield every subset of ``items`` as a sorted tuple.
        if not items:
            yield ()
            return
        head, tail = items[0], items[1:]
        for partial in all_subsets(tail):
            yield tuple(sorted(partial))
            yield tuple(sorted((head,) + partial))

    all_terms = list(all_subsets(("a", "b", "x1", "x2")))
    all_termlist_templates = list(all_subsets(all_terms))
    print(len(all_termlist_templates))

    # (dispreferred, preferred) pairs: a template that has the first term
    # without the second is skipped, which drops some symmetric versions
    # and speeds things up.
    redundant = [
        (("b",), ("a",)),
        (("x2",), ("x1",)),
        (("b", "x2"), ("a", "x1")),
        (("a", "b", "x2"), ("a", "b", "x1")),
        (("b", "x1", "x2"), ("a", "x1", "x2")),
    ]

    import time

    count = 0
    start = time.time()
    for termlist_template in all_termlist_templates:
        present = set(termlist_template)
        if any(worse in present and better not in present for worse, better in redundant):
            continue

        expanded_terms = set()
        for term_template in termlist_template:
            numeric = tuple(name for name in term_template if name.startswith("x"))
            categorical = [name for name in term_template if not name.startswith("x")]
            expanded_terms.update(frozenset(chosen + numeric) for chosen in all_subsets(categorical))

        # The categorical variables have 2 levels each, so every expanded
        # term stands for exactly 1 unique dimension of variation.
        expected_rank = len(expanded_terms)
        if termlist_template == () or termlist_template == ((),):
            # No data dependence, should fail
            assert_raises(PatsyError, make_matrix, data, expected_rank, termlist_template)
        else:
            make_matrix(data, expected_rank, termlist_template)

        count += 1
        if not count % 100:
            print("Completed:", count)
    print("Took %0.2f seconds" % (time.time() - start,))
コード例 #6
0
def test_redundancy_thoroughly():
    """Exhaustively check ranks for 2 categorical and 2 numerical factors.

    The aim is to catch lurking bugs analogous to the ones R has, by
    confirming the matrix rank for every possible combination of terms
    over ``a``, ``b``, ``x1`` and ``x2`` (with some symmetric ones pruned).
    """
    data = balanced(a=2, b=2, repeat=5)
    data["x1"] = np.linspace(0, 1, len(data["a"]))
    data["x2"] = data["x1"]**2

    def all_subsets(l):
        # Build the subsets from the last element backwards so that they
        # come out in the same order a head-first recursion produces.
        subsets = [tuple()]
        for obj in reversed(l):
            grown = []
            for subset in subsets:
                grown.append(tuple(sorted(subset)))
                grown.append(tuple(sorted((obj,) + subset)))
            subsets = grown
        return subsets

    def is_pruned(terms_present):
        # True when a dispreferred term appears without its preferred twin.
        for dispreferred, preferred in redundant:
            if dispreferred in terms_present and preferred not in terms_present:
                return True
        return False

    all_terms = list(all_subsets(("a", "b", "x1", "x2")))
    all_termlist_templates = list(all_subsets(all_terms))
    print(len(all_termlist_templates))
    # Pruning some of the symmetric versions keeps the run time down.
    redundant = [
        [("b",), ("a",)],
        [("x2",), ("x1",)],
        [("b", "x2"), ("a", "x1")],
        [("a", "b", "x2"), ("a", "b", "x1")],
        [("b", "x1", "x2"), ("a", "x1", "x2")],
    ]
    count = 0
    import time
    start = time.time()
    for termlist_template in all_termlist_templates:
        if is_pruned(set(termlist_template)):
            continue
        expanded_terms = set()
        for term_template in termlist_template:
            numeric = tuple(f for f in term_template if f.startswith("x"))
            rest = [f for f in term_template if not f.startswith("x")]
            for subset_rest in all_subsets(rest):
                expanded_terms.add(frozenset(subset_rest + numeric))
        # With 2-level categorical variables, each expanded term
        # contributes 1 unique dimension of variation.
        expected_rank = len(expanded_terms)
        data_free = termlist_template in [(), ((),)]
        if not data_free:
            make_matrix(data, expected_rank, termlist_template)
        else:
            # No data dependence, should fail
            assert_raises(PatsyError,
                          make_matrix,
                          data, expected_rank, termlist_template)
        count += 1
        if count % 100 == 0:
            print("Completed:", count)
    print("Took %0.2f seconds" % (time.time() - start,))
コード例 #7
0
ファイル: test_highlevel.py プロジェクト: gyenney/Tools
def test_categorical():
    """Run a handful of categorical formulas through ``t``.

    The thorough coverage of coding options lives in test_build; here the
    point is only that ``C()``, interactions and data-level contrasts work.
    """
    data = balanced(a=2, b=2)

    # --- C() wrappers -----------------------------------------------------
    rows = [[1, 0], [1, 0], [1, 1], [1, 1]]
    names = ["Intercept", "C(a)[T.a2]"]
    t("~ C(a)", data, 0, True, rows, names)

    rows = [[1, 1], [1, 1], [1, 0], [1, 0]]
    names = ["Intercept", "C(a, levels=['a2', 'a1'])[T.a1]"]
    t("~ C(a, levels=['a2', 'a1'])", data, 0, True, rows, names)

    rows = [[1, 1], [1, 1], [1, 0], [1, 0]]
    names = ["Intercept", "C(a, Treatment(reference=-1))[T.a1]"]
    t("~ C(a, Treatment(reference=-1))", data, 0, True, rows, names)

    # --- Different interactions -------------------------------------------
    rows = [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]]
    names = ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"]
    t("a*b", data, 0, True, rows, names)

    rows = [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
    names = ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"]
    t("0 + a:b", data, 0, True, rows, names)

    rows = [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 0, 1]]
    names = ["Intercept", "a[T.a2]", "a[a1]:b[T.b2]", "a[a2]:b[T.b2]"]
    t("1 + a + a:b", data, 0, True, rows, names)

    # --- Changing contrast with C() ---------------------------------------
    data["a"] = C(data["a"], Helmert)

    rows = [[1, -1], [1, -1], [1, 1], [1, 1]]
    names = ["Intercept", "a[H.a2]"]
    t("a", data, 0, True, rows, names)

    rows = [[1, 0], [1, 0], [1, 1], [1, 1]]
    names = ["Intercept", "C(a, Treatment)[T.a2]"]
    t("C(a, Treatment)", data, 0, True, rows, names)

    # The Treatment override above must not have changed the original
    # object, so the Helmert expectation still holds.
    rows = [[1, -1], [1, -1], [1, 1], [1, 1]]
    names = ["Intercept", "a[H.a2]"]
    t("a", data, 0, True, rows, names)
コード例 #8
0
def test_simple():
    """Build several small matrices with ``make_matrix`` and check their values."""
    data = balanced(a=2, b=2)
    x1 = np.linspace(0, 1, len(data["a"]))
    data["x1"] = x1
    x2 = data["x1"]**2
    data["x2"] = x2

    # Single categorical term.
    design = make_matrix(data, 2, [["a"]], column_names=["a[a1]", "a[a2]"])
    expected = [[1, 0], [1, 0], [0, 1], [0, 1]]
    assert np.allclose(design, expected)

    # Empty term followed by a categorical term.
    names = ["Intercept", "a[T.a2]"]
    design = make_matrix(data, 2, [[], ["a"]], column_names=names)
    expected = [[1, 0], [1, 0], [1, 1], [1, 1]]
    assert np.allclose(design, expected)

    # A lone two-factor interaction.
    names = ["a[a1]:b[b1]", "a[a2]:b[b1]", "a[a1]:b[b2]", "a[a2]:b[b2]"]
    design = make_matrix(data, 4, [["a", "b"]], column_names=names)
    expected = [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
    assert np.allclose(design, expected)

    # Empty term, both factors and their interaction, a before b.
    names = ["Intercept", "a[T.a2]", "b[T.b2]", "a[T.a2]:b[T.b2]"]
    terms = [[], ["a"], ["b"], ["a", "b"]]
    design = make_matrix(data, 4, terms, column_names=names)
    expected = [[1, 0, 0, 0], [1, 0, 1, 0], [1, 1, 0, 0], [1, 1, 1, 1]]
    assert np.allclose(design, expected)

    # The same terms with b listed before a.
    names = ["Intercept", "b[T.b2]", "a[T.a2]", "b[T.b2]:a[T.a2]"]
    terms = [[], ["b"], ["a"], ["b", "a"]]
    design = make_matrix(data, 4, terms, column_names=names)
    expected = [[1, 0, 0, 0], [1, 1, 0, 0], [1, 0, 1, 0], [1, 1, 1, 1]]
    assert np.allclose(design, expected)

    # Categorical factor combined with x1.
    names = ["a[a1]", "a[a2]", "x1", "a[T.a2]:x1"]
    terms = [["a"], ["x1"], ["a", "x1"]]
    design = make_matrix(data, 4, terms, column_names=names)
    expected = [
        [1, 0, x1[0], 0],
        [1, 0, x1[1], 0],
        [0, 1, x1[2], x1[2]],
        [0, 1, x1[3], x1[3]],
    ]
    assert np.allclose(design, expected)

    # x1, x2 and their product.
    names = ["x1", "x2", "x2:x1"]
    terms = [["x1"], ["x2"], ["x2", "x1"]]
    design = make_matrix(data, 3, terms, column_names=names)
    expected = np.column_stack((x1, x2, x1 * x2))
    assert np.allclose(design, expected)
コード例 #9
0
def test_R_bugs():
    """Run ``make_matrix`` on three term lists where R's column counts are notable."""
    data = balanced(a=2, b=2, c=2)
    data["x"] = np.linspace(0, 1, len(data["a"]))

    # Each entry: (second make_matrix argument, term list).
    cases = [
        # "1 + a:b": R emits 5 columns instead of 4, because it fails to
        # notice the redundancy between the two terms.
        (4, [[], ["a", "b"]]),
        # "0 + a:x + a:b": R emits 4 columns instead of 6, because it
        # assumes a redundancy that is not actually there.
        (6, [["a", "x"], ["a", "b"]]),
        # "0 + a:c + a:b": here the redundancy is real. Swapping in another
        # categorical factor lifts the baseline dimensionality to 8, and the
        # redundancy then brings it back down to 6 -- the same number as the
        # previous case, for different reasons. (R gets this one right, but
        # it is worth testing anyway.)
        (6, [["a", "c"], ["a", "b"]]),
    ]
    for rank, terms in cases:
        make_matrix(data, rank, terms)
コード例 #10
0
ファイル: test_build.py プロジェクト: CaptainAL/Spyder
def test_R_bugs():
    """Exercise ``make_matrix`` on formulas tied to R's redundancy handling."""
    data = balanced(a=2, b=2, c=2)
    n_rows = len(data["a"])
    data["x"] = np.linspace(0, 1, n_rows)

    # Formula "1 + a:b". R cannot tell that the two terms are redundant
    # and so produces too many columns (5 rather than 4).
    intercept_and_ab = [[], ["a", "b"]]
    make_matrix(data, 4, intercept_and_ab)

    # Formula "0 + a:x + a:b". R believes there is a redundancy where none
    # exists and so produces too few columns (4 rather than 6).
    ax_and_ab = [["a", "x"], ["a", "b"]]
    make_matrix(data, 6, ax_and_ab)

    # Formula "0 + a:c + a:b", for comparison: this time the redundancy
    # does exist. Confusingly, the extra categorical factor raises the
    # baseline dimensionality to 8 and the redundancy lowers it to 6 again,
    # so the number matches the previous case for different reasons. (R
    # handles this one correctly, but we might as well test it.)
    ac_and_ab = [["a", "c"], ["a", "b"]]
    make_matrix(data, 6, ac_and_ab)
コード例 #11
0
def test_term_order():
    """Check ``design_info.term_names`` ordering for several formulas."""
    data = balanced(a=2, b=2)
    data["x1"] = np.linspace(0, 1, 4)
    data["x2"] = data["x1"]**2

    def t_terms(formula, order):
        # Build the matrix for ``formula`` and compare its term names.
        term_names = dmatrix(formula, data).design_info.term_names
        assert term_names == order

    # (formula, expected term names) pairs, checked in sequence.
    expectations = [
        ("a + b + x1 + x2",
         ["Intercept", "a", "b", "x1", "x2"]),
        ("b + a + x2 + x1",
         ["Intercept", "b", "a", "x2", "x1"]),
        ("0 + x1 + a + x2 + b + 1",
         ["Intercept", "a", "b", "x1", "x2"]),
        ("0 + a:b + a + b + 1",
         ["Intercept", "a", "b", "a:b"]),
        ("a + a:x1 + x2 + x1 + b",
         ["Intercept", "a", "b", "x1", "a:x1", "x2"]),
        ("0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b",
         ["a", "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b", "x2", "x1",
          "a:x1"]),
    ]
    for formula, order in expectations:
        t_terms(formula, order)
コード例 #12
0
ファイル: test_highlevel.py プロジェクト: gyenney/Tools
def test_term_order():
    """Verify the order in which ``dmatrix`` reports term names."""
    data = balanced(a=2, b=2)
    squared_base = np.linspace(0, 1, 4)
    data["x1"] = squared_base
    data["x2"] = data["x1"] ** 2

    def check_order(formula, expected_order):
        # Compare the term names of the built matrix with the expectation.
        info = dmatrix(formula, data).design_info
        assert info.term_names == expected_order

    # Purely additive formulas.
    check_order("a + b + x1 + x2", ["Intercept", "a", "b", "x1", "x2"])
    check_order("b + a + x2 + x1", ["Intercept", "b", "a", "x2", "x1"])
    check_order("0 + x1 + a + x2 + b + 1", ["Intercept", "a", "b", "x1", "x2"])

    # Formulas that mix in interactions.
    check_order("0 + a:b + a + b + 1", ["Intercept", "a", "b", "a:b"])
    check_order(
        "a + a:x1 + x2 + x1 + b",
        ["Intercept", "a", "b", "x1", "a:x1", "x2"],
    )
    long_formula = "0 + a:x1:x2 + a + x2:x1:b + x2 + x1 + a:x1 + x1:x2 + x1:a:x2:a:b"
    long_order = ["a", "x1:x2", "a:x1:x2", "x2:x1:b", "x1:a:x2:b", "x2", "x1", "a:x1"]
    check_order(long_formula, long_order)