Python design_matricesの例、formulae.design_matrices.design_matrices Pythonの例

コード例 #1

0

ファイルを表示

def test_external_transforms(data):
    dm = design_matrices("y ~ np.exp(x1)", data)
    assert np.allclose(dm.common["np.exp(x1)"][:, 0], np.exp(data["x1"]))

    def add_ten(x):
        return x + 10

    dm = design_matrices("y ~ add_ten(x1)", data)
    assert np.allclose(dm.common["add_ten(x1)"][:, 0], data["x1"] + 10)

コード例 #2

0

ファイルを表示

def test_group_specific_intercept_only(data):
    dm = design_matrices("y ~ 0 + (1|g)", data)
    assert len(dm.group.terms_info) == 1
    assert dm.group.terms_info["1|g"]["type"] == "intercept"
    assert dm.group.terms_info["1|g"]["groups"] == ["A", "B"]
    assert dm.group.terms_info["1|g"]["full_names"] == ["1|g[A]", "1|g[B]"]
    assert dm.common == None

コード例 #3

0

ファイルを表示

def test_common_intercept_only_model(data):
    dm = design_matrices("y ~ 1", data)
    assert len(dm.common.terms_info) == 1
    assert dm.common.terms_info["Intercept"]["type"] == "intercept"
    assert dm.common.terms_info["Intercept"]["full_names"] == ["Intercept"]
    assert all(dm.common.design_matrix == 1)
    assert dm.group == None

コード例 #4

0

ファイルを表示

def test_categoric_encoding_with_numeric_interaction():
    np.random.seed(1234)
    size = 20
    data = pd.DataFrame({
        "y": np.random.uniform(size=size),
        "x1": np.random.uniform(size=size),
        "x2": np.random.uniform(size=size),
        "x3": [1, 2, 3, 4] * 5,
        "f": np.random.choice(["A", "B"], size=size),
        "g": np.random.choice(["A", "B"], size=size),
        "h": np.random.choice(["A", "B"], size=size),
        "j": np.random.choice(["A", "B"], size=size),
    })
    dm = design_matrices("y ~ x1 + x2 + f:g + h:j:x2", data)
    assert list(dm.common.terms_info.keys()) == [
        "Intercept", "x1", "x2", "g", "f:g", "j", "h:j:x2"
    ]
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["type"] == "interaction"
    assert dm.common.terms_info["f:g"]["terms"]["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["g"]["encoding"] == "full"
    assert dm.common.terms_info["f:g"]["full_names"] == [
        "f[B]:g[A]", "f[B]:g[B]"
    ]
    assert dm.common.terms_info["j"]["encoding"] == "reduced"
    assert dm.common.terms_info["h:j:x2"]["terms"]["h"][
        "encoding"] == "reduced"
    assert dm.common.terms_info["h:j:x2"]["terms"]["j"]["encoding"] == "full"
    assert dm.common.terms_info["h:j:x2"]["terms"]["x2"]["type"] == "numeric"

コード例 #5

0

ファイルを表示

def test_categoric_group_specific():
    data = pd.DataFrame({
        "BP": np.random.normal(size=30),
        "BMI": np.random.normal(size=30),
        "age_grp": np.random.choice([0, 1, 2], size=30),
    })
    dm = design_matrices("BP ~ 0 + (C(age_grp)|BMI)", data)
    list(dm.group.terms_info.keys()) == [
        "1|BMI", "C(age_grp)[1]|BMI", "C(age_grp)[2]|BMI"
    ]

    dm = design_matrices("BP ~ 0 + (0 + C(age_grp)|BMI)", data)
    list(dm.group.terms_info.keys()) == [
        "C(age_grp)[0]|BMI",
        "C(age_grp)[1]|BMI",
        "C(age_grp)[2]|BMI",
    ]

コード例 #6

0

ファイルを表示

def test_common_predictor(data):
    dm = design_matrices("y ~ x1", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "x1"]
    assert dm.common.terms_info["x1"]["type"] == "numeric"
    assert dm.common.terms_info["x1"]["full_names"] == ["x1"]

    # uses alphabetic order
    # reference is the first value by default
    # reduced because we included intercept
    dm = design_matrices("y ~ f", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "f"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())[1:]
    ]

コード例 #7

0

ファイルを表示

def test_non_syntactic_names():
    data = pd.DataFrame({
        "My response": np.random.normal(size=10),
        "$$#1@@": np.random.normal(size=10),
        "-- ! Hi there!": np.random.normal(size=10),
    })

    dm = design_matrices("`My response` ~ `$$#1@@`*`-- ! Hi there!`", data)
    assert list(dm.common.terms_info.keys()) == [
        "Intercept",
        "$$#1@@",
        "-- ! Hi there!",
        "$$#1@@:-- ! Hi there!",
    ]
    assert np.allclose(dm.common["$$#1@@"][:, 0], data["$$#1@@"])
    assert np.allclose(dm.common["-- ! Hi there!"][:, 0],
                       data["-- ! Hi there!"])
    assert np.allclose(dm.common["-- ! Hi there!"][:, 0],
                       data["-- ! Hi there!"])
    assert np.allclose(dm.common["$$#1@@:-- ! Hi there!"][:, 0],
                       data["$$#1@@"] * data["-- ! Hi there!"])

コード例 #8

0

ファイルを表示

def test_categoric_encoding(data):
    # No intercept, one categoric predictor
    dm = design_matrices("y ~ 0 + f", data)
    assert list(dm.common.terms_info.keys()) == ["f"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "full"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())
    ]
    assert dm.common.design_matrix.shape == (20, 2)

    # Intercept, one categoric predictor
    dm = design_matrices("y ~ 1 + f", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "f"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())[1:]
    ]
    assert dm.common.design_matrix.shape == (20, 2)

    # No intercept, two additive categoric predictors
    dm = design_matrices("y ~ 0 + f + g", data)
    assert list(dm.common.terms_info.keys()) == ["f", "g"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["g"]["levels"] == sorted(
        list(data["g"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["g"]["reference"] == sorted(
        list(data["g"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "full"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())
    ]
    assert dm.common.terms_info["g"]["full_names"] == [
        f"g[{l}]" for l in sorted(data["g"].unique())[1:]
    ]
    assert dm.common.design_matrix.shape == (20, 3)

    # Intercept, two additive categoric predictors
    dm = design_matrices("y ~ 1 + f + g", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "f", "g"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["g"]["levels"] == sorted(
        list(data["g"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["g"]["reference"] == sorted(
        list(data["g"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())[1:]
    ]
    assert dm.common.terms_info["g"]["full_names"] == [
        f"g[{l}]" for l in sorted(data["g"].unique())[1:]
    ]
    assert dm.common.design_matrix.shape == (20, 3)

    # No intercept, two categoric predictors with interaction
    dm = design_matrices("y ~ 0 + f + g + f:g", data)
    assert list(dm.common.terms_info.keys()) == ["f", "g", "f:g"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["f:g"]["type"] == "interaction"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["g"]["levels"] == sorted(
        list(data["g"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["g"]["reference"] == sorted(
        list(data["g"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "full"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())
    ]
    assert dm.common.terms_info["g"]["full_names"] == [
        f"g[{l}]" for l in sorted(data["g"].unique())[1:]
    ]
    assert dm.common.terms_info["f:g"]["full_names"] == ["f[B]:g[B]"]
    assert dm.common.design_matrix.shape == (20, 4)

    # Intercept, two categoric predictors with interaction
    dm = design_matrices("y ~ 1 + f + g + f:g", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "f", "g", "f:g"]
    assert dm.common.terms_info["f"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["f:g"]["type"] == "interaction"
    assert dm.common.terms_info["f"]["levels"] == sorted(
        list(data["f"].unique()))
    assert dm.common.terms_info["g"]["levels"] == sorted(
        list(data["g"].unique()))
    assert dm.common.terms_info["f"]["reference"] == sorted(
        list(data["f"].unique()))[0]
    assert dm.common.terms_info["g"]["reference"] == sorted(
        list(data["g"].unique()))[0]
    assert dm.common.terms_info["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f"]["full_names"] == [
        f"f[{l}]" for l in sorted(data["f"].unique())[1:]
    ]
    assert dm.common.terms_info["g"]["full_names"] == [
        f"g[{l}]" for l in sorted(data["g"].unique())[1:]
    ]
    assert dm.common.terms_info["f:g"]["full_names"] == ["f[B]:g[B]"]
    assert dm.common.design_matrix.shape == (20, 4)

    # No intercept, interaction between two categorics
    dm = design_matrices("y ~ 0 + f:g", data)
    assert list(dm.common.terms_info.keys()) == ["f:g"]
    assert dm.common.terms_info["f:g"]["type"] == "interaction"
    assert dm.common.terms_info["f:g"]["terms"]["f"]["encoding"] == "full"
    assert dm.common.terms_info["f:g"]["terms"]["g"]["encoding"] == "full"
    assert dm.common.terms_info["f:g"]["full_names"] == [
        "f[A]:g[A]",
        "f[A]:g[B]",
        "f[B]:g[A]",
        "f[B]:g[B]",
    ]
    assert dm.common.design_matrix.shape == (20, 4)

    # Intercept, interaction between two categorics
    # It adds "g" -> It uses Patsy algorithm... look there if you're curious.
    dm = design_matrices("y ~ 1 + f:g", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "g", "f:g"]
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["type"] == "interaction"
    assert dm.common.terms_info["f:g"]["terms"]["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["g"]["encoding"] == "full"
    assert dm.common.terms_info["f:g"]["full_names"] == [
        "f[B]:g[A]", "f[B]:g[B]"
    ]
    assert dm.common.design_matrix.shape == (20, 4)

    # Same than before
    dm = design_matrices("y ~ 1 + g + f:g", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "g", "f:g"]
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["type"] == "interaction"
    assert dm.common.terms_info["f:g"]["terms"]["f"]["encoding"] == "reduced"
    assert dm.common.terms_info["f:g"]["terms"]["g"]["encoding"] == "full"
    assert dm.common.terms_info["f:g"]["full_names"] == [
        "f[B]:g[A]", "f[B]:g[B]"
    ]
    assert dm.common.design_matrix.shape == (20, 4)

コード例 #9

0

ファイルを表示

def test_empty_model(data):
    dm = design_matrices("y ~ 0", data)
    assert dm.common == None
    assert dm.group == None

コード例 #10

0

ファイルを表示

def test_empty_formula(data):
    with pytest.raises(ValueError):
        design_matrices("", data)

コード例 #11

0

ファイルを表示

def test_built_in_transforms(data):
    # {...} gets translated to I(...)
    dm = design_matrices("y ~ {x1 + x2}", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "I(x1 + x2)"]
    assert dm.common.terms_info["I(x1 + x2)"]["type"] == "numeric"
    assert np.allclose(dm.common["I(x1 + x2)"],
                       np.atleast_2d((data["x1"] + data["x2"]).to_numpy()).T)

    dm2 = design_matrices("y ~ I(x1 + x2)", data)
    assert compare_dicts(dm.common.terms_info, dm2.common.terms_info)

    # center()
    dm = design_matrices("y ~ center(x1)", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "center(x1)"]
    assert dm.common.terms_info["center(x1)"]["type"] == "numeric"
    assert np.allclose(dm.common["center(x1)"].mean(), 0)

    # scale()
    dm = design_matrices("y ~ scale(x1)", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "scale(x1)"]
    assert dm.common.terms_info["scale(x1)"]["type"] == "numeric"
    assert np.allclose(dm.common["scale(x1)"].mean(), 0)
    assert np.allclose(dm.common["scale(x1)"].std(), 1)

    # standardize(), alias of scale()
    dm = design_matrices("y ~ standardize(x1)", data)
    assert list(
        dm.common.terms_info.keys()) == ["Intercept", "standardize(x1)"]
    assert dm.common.terms_info["standardize(x1)"]["type"] == "numeric"
    assert np.allclose(dm.common["standardize(x1)"].mean(), 0)
    assert np.allclose(dm.common["standardize(x1)"].std(), 1)

    # C()
    # Intercept, no extra arguments, reference is first value observed
    dm = design_matrices("y ~ C(x3)", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "C(x3)"]
    assert dm.common.terms_info["C(x3)"]["type"] == "categoric"
    assert dm.common.terms_info["C(x3)"]["encoding"] == "reduced"
    assert dm.common.terms_info["C(x3)"]["reference"] == 1
    assert dm.common.terms_info["C(x3)"]["levels"] == [1, 2, 3, 4]
    assert dm.common.terms_info["C(x3)"]["full_names"] == [
        "C(x3)[2]", "C(x3)[3]", "C(x3)[4]"
    ]

    # No intercept, no extra arguments
    dm = design_matrices("y ~ 0 + C(x3)", data)
    assert list(dm.common.terms_info.keys()) == ["C(x3)"]
    assert dm.common.terms_info["C(x3)"]["type"] == "categoric"
    assert dm.common.terms_info["C(x3)"]["encoding"] == "full"
    assert dm.common.terms_info["C(x3)"]["reference"] == 1
    assert dm.common.terms_info["C(x3)"]["levels"] == [1, 2, 3, 4]
    assert dm.common.terms_info["C(x3)"]["full_names"] == [
        "C(x3)[1]",
        "C(x3)[2]",
        "C(x3)[3]",
        "C(x3)[4]",
    ]

    # Specify reference -> it is converted into 0-1 variable
    dm = design_matrices("y ~ C(x3, 3)", data)
    assert dm.common.terms_info["C(x3, 3)"]["type"] == "numeric"
    assert dm.common.terms_info["C(x3, 3)"]["full_names"] == ["C(x3, 3)"]
    assert all(dm.common["C(x3, 3)"][:, 0] == np.where(data["x3"] == 3, 1, 0))

    # Specify levels, different to observed
    lvls = [3, 2, 4, 1]
    dm = design_matrices("y ~ C(x3, levels=lvls)", data)
    assert dm.common.terms_info["C(x3, levels = lvls)"]["type"] == "categoric"
    assert dm.common.terms_info["C(x3, levels = lvls)"]["reference"] == 3
    assert dm.common.terms_info["C(x3, levels = lvls)"]["levels"] == lvls

    # Pass a reference not in the data
    with pytest.raises(ValueError):
        dm = design_matrices("y ~ C(x3, 5)", data)

    # Pass categoric, remains unchanged
    dm = design_matrices("y ~ C(f)", data)
    dm2 = design_matrices("y ~ f", data)
    d1 = dm.common.terms_info["C(f)"]
    d2 = dm2.common.terms_info["f"]
    assert d1["type"] == d2["type"]
    assert d1["levels"] == d2["levels"]
    assert d1["reference"] == d2["reference"]
    assert d1["encoding"] == d2["encoding"]
    assert not d1["full_names"] == d2[
        "full_names"]  # because one is 'C(f)' and other is 'f'
    assert all(dm.common["C(f)"] == dm2.common["f"])

コード例 #12

0

ファイルを表示

def test_interactions(data):
    # These two models are the same
    dm = design_matrices("y ~ f * g", data)
    dm2 = design_matrices("y ~ f + g + f:g", data)
    assert compare_dicts(dm2.common.terms_info, dm.common.terms_info)

    # When no intercept too
    dm = design_matrices("y ~ 0 + f * g", data)
    dm2 = design_matrices("y ~ 0 + f + g + f:g", data)
    assert compare_dicts(dm2.common.terms_info, dm.common.terms_info)

    # Mix of numeric/categoric
    # "g" in "g" -> reduced
    # "g" in "x1:g" -> reduced because x1 is present in formula
    dm = design_matrices("y ~ x1 + g + x1:g", data)
    assert list(
        dm.common.terms_info.keys()) == ["Intercept", "x1", "g", "x1:g"]
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["x1:g"]["terms"]["g"]["encoding"] == "reduced"

    # "g" in "g" -> reduced
    # "g" in "x1:g" -> full because x1 is not present in formula
    dm = design_matrices("y ~ g + x1:g", data)
    assert list(dm.common.terms_info.keys()) == ["Intercept", "g", "x1:g"]
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["x1:g"]["terms"]["g"]["encoding"] == "full"

    # "g" in "x1:x2:g" is full, because x1:x2 is a new group and we don't have x1:x2 in the model
    dm = design_matrices("y ~ x1 + g + x1:g + x1:x2:g", data)
    assert list(dm.common.terms_info.keys()) == [
        "Intercept", "x1", "g", "x1:g", "x1:x2:g"
    ]
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["x1:g"]["terms"]["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["x1:x2:g"]["terms"]["g"]["encoding"] == "full"

    # "g" in "x1:x2:g" is reduced, because x1:x2 is a new group and we have x1:x2 in the model
    dm = design_matrices("y ~ x1 + g + x1:x2 + x1:g + x1:x2:g", data)
    assert list(dm.common.terms_info.keys()) == [
        "Intercept", "x1", "g", "x1:x2", "x1:g", "x1:x2:g"
    ]
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["x1:g"]["terms"]["g"]["encoding"] == "reduced"
    assert dm.common.terms_info["x1:x2:g"]["terms"]["g"][
        "encoding"] == "reduced"

    # And now, since we don't have intercept, x1 and x1:x2 all "g" are full
    dm = design_matrices("y ~ 0 + g + x1:g + x1:x2:g", data)
    assert list(dm.common.terms_info.keys()) == ["g", "x1:g", "x1:x2:g"]
    assert dm.common.terms_info["g"]["type"] == "categoric"
    assert dm.common.terms_info["g"]["encoding"] == "full"
    assert dm.common.terms_info["x1:g"]["terms"]["g"]["encoding"] == "full"
    assert dm.common.terms_info["x1:x2:g"]["terms"]["g"]["encoding"] == "full"

    # Two numerics
    dm = design_matrices("y ~ x1:x2", data)
    assert "x1:x2" in dm.common.terms_info.keys()
    assert np.allclose(dm.common["x1:x2"][:, 0], data["x1"] * data["x2"])