def test_product_with_frequency_table_discrete_distribution():
    """Check products of a DiscreteDistribution with a FrequencyTable.

    Both operand orders are exercised, first with a table whose name ("Y1")
    is new to the distribution and then with one that shares the name "X2".
    """
    # without common names
    table = FrequencyTable({"A": 3, "B": 4, "C": 7}, name="Y1")
    dist = DiscreteDistribution(sample_1, names=["X1", "X2", "X3", "X4"])

    # Distribution on the left: the table's name is expected last.
    joint = dist * table
    key = ("a", "y", 2, 33, "B")
    assert all(compare(joint.names, ["X1", "X2", "X3", "X4", "Y1"]))
    assert joint.total == (dist.total * table.total)
    # check probabilities
    assert joint.frequency(key) == 6 * 4
    assert joint.probability(key) == 24 / 1708
    assert joint[key] == 24

    # Table on the left: the table's name is expected first.
    joint = table * dist
    key = ("B", "a", "y", 2, 33)
    assert all(compare(joint.names, ["Y1", "X1", "X2", "X3", "X4"]))
    assert joint.total == (dist.total * table.total)
    # check probabilities
    assert joint.frequency(key) == 6 * 4
    assert joint.probability(key) == 24 / 1708
    assert joint[key] == 24

    # with common names
    table = FrequencyTable({"x": 3, "y": 4}, name="X2")
    dist = DiscreteDistribution(sample_1, names=["X1", "X2", "X3", "X4"])

    joint = dist * table
    assert all(compare(joint.names, ["X1", "X2", "X3", "X4"]))
    assert joint.total == 52 * 3 + 70 * 4

    joint = table * dist
    assert all(compare(joint.names, ["X2", "X1", "X3", "X4"]))
    assert joint.total == 52 * 3 + 70 * 4
def test_entropy():
    """Check entropy() for frequency tables and discrete distributions."""
    # Every single-variable case is run on both container types, in this order.
    containers = (FrequencyTable, DiscreteDistribution)

    # Binary 50-50 samples: entropy = log2(2)
    balanced = [1, 1, 2, 2, 1, 1, 2, 2]
    for build in containers:
        assert entropy(build(balanced)) == np.log2(2)

    # 60-40 samples: entropy = 0.970950
    skewed = ["Dog"] * 6 + ["Cat"] * 4
    for build in containers:
        assert entropy(build(skewed)) == approx(0.970950)

    # Deterministic case: all of the mass sits on one level.
    certain = {"Dog": 10, "Cat": 0}
    for build in containers:
        assert entropy(build(certain)) == 0

    # Multiple levels (tuple keys), discrete distribution only.
    two_columns = {(1, 2): 150, (1, 3): 150, (2, 2): 300, (2, 3): 400}
    assert entropy(DiscreteDistribution(two_columns)) == approx(1.8709505945)
def test_marginals_operator_discrete_distribution():
    """Check the ``<<`` operator with default and with custom variable names.

    The assertions expect ``dist << names`` to keep the total unchanged and
    to leave only the levels of the variables that were not named.
    """
    # Four levels dist.
    samples = {
        ("a", "x", 1, 33): 1, ("a", "x", 2, 33): 2,
        ("a", "x", 1, 44): 3, ("a", "x", 2, 44): 4,
        ("a", "y", 1, 33): 5, ("a", "y", 2, 33): 6,
        ("a", "y", 1, 44): 7, ("a", "y", 2, 44): 8,
        ("b", "x", 1, 33): 9, ("b", "x", 2, 33): 10,
        ("b", "x", 1, 44): 11, ("b", "x", 2, 44): 12,
        ("b", "y", 1, 33): 13, ("b", "y", 2, 33): 14,
        ("b", "y", 1, 44): 15, ("b", "y", 2, 44): 16,
    }
    # Each case: the distribution, the single name used in the first total
    # check, and the four variable names in column order.
    cases = (
        (
            DiscreteDistribution(samples),
            "X2",
            ("X1", "X2", "X3", "X4"),
        ),
        (
            DiscreteDistribution(
                samples, names=["Age", "Sex", "Education", "City"]),
            "Age",
            ("Age", "Sex", "Education", "City"),
        ),
    )
    for dist, single, (first, second, third, fourth) in cases:
        for dropped in (single, (second, third), (second, third, fourth)):
            assert (dist << dropped).total == dist.total

        third_only = dist << (first, second, fourth)
        assert all(compare(third_only.keys_as_list(), [1, 2]))

        fourth_only = dist << (first, second, third)
        assert all(compare(fourth_only.keys_as_list(), [33, 44]))

        first_only = dist << (second, third, fourth)
        assert all(compare(first_only.keys_as_list(), ["a", "b"]))

        first_and_fourth = dist << (second, third)
        assert all(
            compare(
                first_and_fourth.keys_as_list(),
                [("a", 33), ("a", 44), ("b", 33), ("b", 44)],
            ))
def test_iterable_samples_discrete_distribution():
    """Check that one-shot iterables are accepted as samples.

    An iterator and a generator over the same text must each produce a
    distribution whose total equals the number of characters.
    """
    text = (
        "It is a long established fact that a reader will be distracted "
        "by the readable content of a page when looking at its layout. "
        "The point of using Lorem Ipsum is that it has a more-or-less "
        "normal distribution of letters, as opposed to using "
        "'Content here, content here', making it look like readable English."
    )
    expected_total = len(text)

    # Plain iterator over the characters.
    from_iterator = DiscreteDistribution(iter(text))
    assert from_iterator.total == expected_total

    # Generator expression over the characters.
    characters = (char for char in text)
    from_generator = DiscreteDistribution(characters)
    assert from_generator.total == expected_total
def test_conditional_discrete_distribution():
    """Check condition_on("X2") on a four-variable distribution."""
    # Four levels dist.
    samples = {
        ("a", "x", 1, 33): 1, ("a", "x", 2, 33): 2,
        ("a", "x", 1, 44): 3, ("a", "x", 2, 44): 4,
        ("a", "y", 1, 33): 5, ("a", "y", 2, 33): 6,
        ("a", "y", 1, 44): 7, ("a", "y", 2, 44): 8,
        ("b", "x", 1, 33): 9, ("b", "x", 2, 33): 10,
        ("b", "x", 1, 44): 11, ("b", "x", 2, 44): 12,
        ("b", "y", 1, 33): 13, ("b", "y", 2, 33): 14,
        ("b", "y", 1, 44): 15, ("b", "y", 2, 44): 16,
    }
    conditional = DiscreteDistribution(samples).condition_on("X2")

    assert all(compare(conditional.conditional_rvs.names, ["X2"]))
    for level in ("x", "y"):
        remaining = conditional.distributions[level].names
        assert all(compare(remaining, ["X1", "X3", "X4"]))

    # (key over X1, X3, X4; conditioning level of X2; expected frequency)
    expectations = [
        (("a", 1, 33), "x", 1),
        (("a", 1, 33), "y", 5),
        (("a", 1, 44), "x", 3),
        (("a", 1, 44), "y", 7),
        (("b", 1, 33), "x", 9),
        (("b", 1, 33), "y", 13),
        (("b", 1, 44), "x", 11),
        (("b", 1, 44), "y", 15),
        (("b", 2, 44), "x", 12),
        (("b", 2, 33), "y", 14),
    ]
    # Denominators the test expects for the probabilities under each level.
    level_totals = {"x": 52, "y": 84}

    for key, level, count in expectations:
        assert conditional.frequency(key, level) == count
    for key, level, count in expectations:
        assert conditional.probability(key, level) == count / level_totals[level]
def test_levels_is_numeric_discrete_distribution():
    """Check the is_numeric flag of each variable, by index and by name."""
    samples = {
        ("a", "x", 1, 33, 1.5): 1, ("a", "x", 2, 33, 1.5): 2,
        ("a", "x", 1, 44, 1.5): 3, ("a", "x", 2, 44, 1.5): 4,
        ("a", "y", 1, 33, 1.5): 5, ("a", "y", 2, 33, 1.5): 6,
        ("a", "y", 1, 44, 2.5): 7, ("a", "y", 2, 44, 2.5): 8,
        ("b", "x", 1, 33, 2.5): 9, ("b", "x", 2, 33, 2.5): 10,
        ("b", "x", 1, 44, 2.5): 11, ("b", "x", 2, 44, 2.5): 12,
        ("b", "y", 1, 33, 3.5): 13, ("b", "y", 2, 33, 3.5): 14,
        ("b", "y", 1, 44, 3.5): 15, ("b", "y", 2, 44, 3.5): 16,
    }
    dist = DiscreteDistribution(samples)

    # The two string columns are expected to be non-numeric; the int and
    # float columns numeric.
    expected_flags = {
        "X1": False,
        "X2": False,
        "X3": True,
        "X4": True,
        "X5": True,
    }

    # by index
    for position, flag in enumerate(expected_flags.values()):
        assert bool(dist.rvs[position].is_numeric) is flag

    # by name
    for name, flag in expected_flags.items():
        assert bool(dist.rvs[name].is_numeric) is flag
def test_numpy_array_discrete_distribution():
    """Check DiscreteDistribution.from_np_array input validation and result."""
    # A set (not a numpy array) and a flat list (not a list of lists) must
    # both be rejected.
    for rejected in ({1, 2, 3}, [1, 2, 3]):
        with pytest.raises(ValueError):
            DiscreteDistribution.from_np_array(rejected)

    # list of list or numpy 2D array converts to tuples
    flat = np.r_[["A"] * 24, ["B"] * 48, ["C"] * 4, ["D"] * 7, ["E"] * 17]
    one_column = flat.reshape(-1, 1)
    dist = DiscreteDistribution.from_np_array(one_column)
    assert dist.total == 100

    # It is important to use tuple as key
    # since list is not hashable
    key = ("A", )
    assert dist.probability(key) == 0.24
    assert dist.prob(X1=key) == 0.24
def test_product_with_two_common_vars_discrete_distribution():
    """Check the product of two distributions sharing "X2" and "X3"."""
    left = DiscreteDistribution(sample_1, names=["X1", "X2", "X3", "X4"])
    right = DiscreteDistribution(sample_2, names=["X3", "X5", "X6", "X2"])
    joint = left * right

    assert all(compare(joint.names, ["X1", "X2", "X3", "X4", "X5", "X6"]))
    assert joint.total == 24 * 36 + 28 * 100 + 40 * 164 + 30 * 51

    # check probabilities
    present = ("a", "y", 2, 33, "high", "under")
    assert joint.frequency(present) == 6 * 25
    assert joint.probability(present) == 150 / 11754
    assert joint[present] == 150

    # Keys expected to be empty: first the case where the right operand does
    # not have the common values, then the case where the left does not.
    empty_keys = (
        ("a", "y", 2, 33, "low", "under"),
        ("b", "y", 2, 33, "high", "under"),
    )
    for key in empty_keys:
        assert joint.frequency(key) == 0
        assert joint.probability(key) == 0
        assert joint[key] == 0
def test_product_with_no_common_vars_discrete_distribution():
    """Check the product of two distributions with disjoint variable names."""
    left = DiscreteDistribution(sample_1, names=["X1", "X2", "X3", "X4"])
    right = DiscreteDistribution(sample_2, names=["Y1", "Y2", "Y3", "Y4"])
    joint = left * right

    expected_names = ["X1", "X2", "X3", "X4", "Y1", "Y2", "Y3", "Y4"]
    assert all(compare(joint.names, expected_names))
    assert joint.total == (left.total * right.total)

    # check probabilities
    # NOTE: the frequency check uses the "under" key while the two checks
    # after it use the "normal" key, exactly as in the original test.
    under_key = ("a", "x", 1, 33, 2, "high", "under", "x")
    normal_key = ("a", "x", 1, 33, 2, "high", "normal", "x")
    assert joint.frequency(under_key) == 9
    assert joint.probability(normal_key) == 10 / 42822
    assert joint[normal_key] == 10

    over_key = ("b", "x", 1, 44, 1, "low", "over", "y")
    assert joint.frequency(over_key) == 253
    assert joint.probability(over_key) == 253 / 42822
    assert joint[over_key] == 253
def test_product_with_one_common_var_discrete_distribution():
    """Check the product of two distributions sharing only "X3"."""
    left = DiscreteDistribution(sample_1, names=["X1", "X2", "X3", "X4"])
    right = DiscreteDistribution(sample_2, names=["X3", "X5", "X6", "X7"])
    joint = left * right

    expected_names = ["X1", "X2", "X3", "X4", "X5", "X6", "X7"]
    assert all(compare(joint.names, expected_names))
    assert joint.total == (36 + 164) * 64 + (100 + 51) * 58

    # check probabilities
    present = ("a", "x", 1, 33, "high", "normal", "x")
    assert joint.frequency(present) == 2
    assert joint.probability(present) == 2 / 21558
    assert joint[present] == 2

    # Keys expected to be empty: first the case where the right operand does
    # not have the common value, then the case where the left does not.
    empty_keys = (
        ("b", "y", 2, 44, "high", "over", "y"),
        ("b", "y", 2, 33, "high", "normal", "y"),
    )
    for key in empty_keys:
        assert joint.frequency(key) == 0
        assert joint.probability(key) == 0
        assert joint[key] == 0
def test_conditional_operator_discrete_distribution():
    """Check the ``|`` operator with one and with two conditioning names."""
    # Four levels dist.
    samples = {
        ("a", "x", 1, 33): 1, ("a", "x", 2, 33): 2,
        ("a", "x", 1, 44): 3, ("a", "x", 2, 44): 4,
        ("a", "y", 1, 33): 5, ("a", "y", 2, 33): 6,
        ("a", "y", 1, 44): 7, ("a", "y", 2, 44): 8,
        ("b", "x", 1, 33): 9, ("b", "x", 2, 33): 10,
        ("b", "x", 1, 44): 11, ("b", "x", 2, 44): 12,
        ("b", "y", 1, 33): 13, ("b", "y", 2, 33): 14,
        ("b", "y", 1, 44): 15, ("b", "y", 2, 44): 16,
    }
    dist = DiscreteDistribution(samples)

    # Conditioning on a single variable.
    given_x2 = dist | "X2"
    assert all(compare(given_x2.conditional_rvs.names, ["X2"]))
    for level in ("x", "y"):
        remaining = given_x2.distributions[level].names
        assert all(compare(remaining, ["X1", "X3", "X4"]))
    key = ("a", 1, 33)
    assert given_x2.frequency(key, "x") == 1
    assert given_x2.frequency(key, "y") == 5
    assert given_x2.probability(key, "x") == 1 / 52
    assert given_x2.probability(key, "y") == 5 / 84

    # Conditioning on two variables: the conditional levels become tuples.
    given_x2_x3 = dist | ("X2", "X3")
    assert all(compare(given_x2_x3.conditional_rvs.names, ["X2", "X3"]))
    for levels in (("x", 1), ("x", 2), ("y", 1), ("y", 2)):
        remaining = given_x2_x3.distributions[levels].names
        assert all(compare(remaining, ["X1", "X4"]))
    assert given_x2_x3.frequency(("a", 33), ("x", 1)) == 1
    assert given_x2_x3.probability(("a", 33), ("x", 1)) == 1 / 24
def test_three_levels_discrete_distribution():
    """Check levels, frequencies and probabilities of three-variable dists.

    Seven sample sets of growing size are run through the same checks; a few
    of them additionally probe keys that are absent from the samples.
    """

    def build_and_check(samples, levels, normalised, absent=()):
        # Build the distribution and run the checks shared by every scenario.
        dist = DiscreteDistribution(samples)
        for actual, expected in zip(dist.levels(), levels):
            assert all(compare(actual, expected))
        assert dist.rvs.size == 3
        for key, count in samples.items():
            assert dist[key] == count
        # Keys that were never sampled are expected to read as zero.
        for key in absent:
            assert dist[key] == 0
        assert all(compare(dist.frequencies(normalised=True), normalised))
        assert all(
            compare(dist.frequencies(normalised=False),
                    list(samples.values())))
        total = sum(samples.values())
        for (x1, x2, x3), count in samples.items():
            assert dist.prob(X1=x1, X2=x2, X3=x3) == count / total
        return dist

    # One key only.
    dist = build_and_check(
        {("A", "y", 1): 2},
        levels=[["A"], ["y"], [1]],
        normalised=[1],
        absent=[("A", "x", 2)],
    )
    assert dist.prob(X1="A", X2="y", X3=2) == 0

    # Two keys differing in the second variable.
    dist = build_and_check(
        {("A", "x", 1): 2, ("A", "y", 1): 2},
        levels=[["A"], ["x", "y"], [1]],
        normalised=[0.5, 0.5],
    )
    assert dist.prob(X1="A", X2="y", X3=2) == 0

    # Two keys differing in every variable.
    build_and_check(
        {("A", "x", 1): 2, ("B", "y", 2): 2},
        levels=[["A", "B"], ["x", "y"], [1, 2]],
        normalised=[0.5, 0.5],
        absent=[("B", "y", 3)],
    )

    # Three keys; a bare string key is also expected to read as zero.
    dist = build_and_check(
        {("A", "x", 1): 1, ("A", "y", 2): 2, ("B", "x", 1): 3},
        levels=[["A", "B"], ["x", "y"], [1, 2]],
        normalised=[1 / 6, 2 / 6, 3 / 6],
        absent=["B"],
    )
    assert dist.prob(X1="B", X2="y", X3=2) == 0

    # Four keys.
    build_and_check(
        {
            ("A", "x", 1): 1,
            ("A", "y", 2): 2,
            ("B", "x", 1): 3,
            ("B", "y", 2): 4,
        },
        levels=[["A", "B"], ["x", "y"], [1, 2]],
        normalised=[0.1, 0.2, 0.3, 0.4],
    )

    # Five keys, adding a third level to the first and third variables.
    build_and_check(
        {
            ("A", "x", 1): 1,
            ("A", "y", 2): 2,
            ("B", "x", 1): 3,
            ("B", "y", 2): 4,
            ("C", "y", 3): 5,
        },
        levels=[["A", "B", "C"], ["x", "y"], [1, 2, 3]],
        normalised=[1 / 15, 2 / 15, 3 / 15, 4 / 15, 5 / 15],
    )

    # Five keys with three, three and four levels respectively.
    build_and_check(
        {
            ("A", "x", 1): 1,
            ("A", "y", 2): 2,
            ("B", "x", 3): 3,
            ("B", "y", 3): 4,
            ("C", "z", 4): 5,
        },
        levels=[["A", "B", "C"], ["x", "y", "z"], [1, 2, 3, 4]],
        normalised=[1 / 15, 2 / 15, 3 / 15, 4 / 15, 5 / 15],
    )
def test_product_exceptions_discrete_distribution():
    """Check that product() rejects a plain integer operand."""
    dist = DiscreteDistribution(sample_1)
    not_a_distribution = 2
    with pytest.raises(ValueError):
        dist.product(not_a_distribution)
def test_statistical_independence_frequency_table():
    """Check that an independent joint equals the product of its marginals.

    P(x, y, z) = P(x)P(y)P(z): a joint distribution is built by multiplying
    independent factors, a single-variable marginal is extracted for every
    variable, and those marginals are multiplied back together. The result
    must give every key the same probability as the original joint.

    Note: the multi-variable distributions must themselves be statistically
    independent, which is why their frequencies are products of per-variable
    weights.
    """

    def product_of_marginals(joint):
        # Multiply together one single-variable marginal per variable.
        marginals = []
        for name in joint.names:
            # marginal() takes the names to sum out, so pass all but one.
            names_except_one = list(set(joint.names) - {name})
            marginals.append(joint.marginal(*names_except_one))
        # np.prod is the supported spelling; np.product was deprecated in
        # NumPy 1.25 and removed in NumPy 2.0.
        return np.prod(marginals)

    s_1 = {
        ("x", 1): 1 * 6,
        ("x", 2): 1 * 4,
        ("y", 1): 9 * 6,
        ("y", 2): 9 * 4,
    }
    dist1 = DiscreteDistribution(s_1, names=["X1", "X2"])

    # Each frequency of the second distribution is the product of one weight
    # per variable. The loop nesting (Y4 outermost, Y3 innermost) fixes the
    # insertion order of the keys.
    y1_weights = {1: 4, 2: 2}
    y2_weights = {"high": 3, "low": 2}
    y3_weights = {"under": 1, "normal": 2, "over": 3, "obese": 4}
    y4_weights = {"x": 1, "y": 3}
    s_2 = {}
    for y4, w4 in y4_weights.items():
        for y1, w1 in y1_weights.items():
            for y2, w2 in y2_weights.items():
                for y3, w3 in y3_weights.items():
                    s_2[(y1, y2, y3, y4)] = w1 * w2 * w3 * w4
    dist2 = DiscreteDistribution(s_2, names=["Y1", "Y2", "Y3", "Y4"])

    freq_table3 = FrequencyTable({11: 2, 22: 4, 33: 3}, name="Z")

    # Raw frequencies: the probabilities must match exactly.
    joint_dist = dist1 * dist2 * freq_table3
    joint_dist2 = product_of_marginals(joint_dist)
    for k1 in joint_dist:
        assert joint_dist.probability(k1) == joint_dist2.probability(k1)

    # Test by normalising the distributions; floating-point rounding is
    # tolerated here.
    dist1.normalise()
    dist2.normalise()
    freq_table3.normalise()
    joint_dist = dist1 * dist2 * freq_table3
    joint_dist2 = product_of_marginals(joint_dist)
    for k1 in joint_dist:
        assert joint_dist.probability(k1) == approx(
            joint_dist2.probability(k1), abs=1e-16)
        assert joint_dist[k1] == approx(joint_dist2[k1], abs=1e-16)
def test_reduce_by_name_discrete_distribution():
    """Check reduce() by variable name, with default and custom names.

    The assertions expect reduce() to drop each fixed variable and to keep
    only the rows that match the given values.
    """
    samples = {
        ("a", "x", 1, 33): 1, ("a", "x", 2, 33): 2,
        ("a", "x", 1, 44): 3, ("a", "x", 2, 44): 4,
        ("a", "y", 1, 33): 5, ("a", "y", 2, 33): 6,
        ("a", "y", 1, 44): 7, ("a", "y", 2, 44): 8,
        ("b", "x", 1, 33): 9, ("b", "x", 2, 33): 10,
        ("b", "x", 1, 44): 11, ("b", "x", 2, 44): 12,
        ("b", "y", 1, 33): 13, ("b", "y", 2, 33): 14,
        ("b", "y", 1, 44): 15, ("b", "y", 2, 44): 16,
    }
    # Each case pairs a distribution with its variable names in column order.
    cases = (
        (
            DiscreteDistribution(samples),
            ("X1", "X2", "X3", "X4"),
        ),
        (
            DiscreteDistribution(samples, names=["Y", "Z", "W", "X"]),
            ("Y", "Z", "W", "X"),
        ),
    )
    for dist, (first, second, third, fourth) in cases:
        # Fix the second variable.
        reduced = dist.reduce(**{second: "y"})
        assert reduced.rvs.size == 3
        assert all(compare(reduced.rvs.names, [first, third, fourth]))
        assert reduced[("a", 1, 33)] == 5
        assert reduced[("b", 2, 44)] == 16
        assert reduced.frequency(("a", 1, 33)) == 5
        assert reduced.frequency(("b", 2, 44)) == 16
        assert reduced.probability(("a", 1, 33)) == 5 / 84
        assert reduced.probability(("b", 2, 44)) == 16 / 84

        # Fix the second and third variables.
        reduced = dist.reduce(**{second: "y", third: 1})
        assert reduced.rvs.size == 2
        assert all(compare(reduced.rvs.names, [first, fourth]))
        assert reduced[("a", 33)] == 5
        assert reduced[("b", 44)] == 15
        assert reduced.frequency(("a", 33)) == 5
        assert reduced.frequency(("b", 44)) == 15
        assert reduced.probability(("a", 33)) == 5 / 40
        assert reduced.probability(("b", 44)) == 15 / 40

        # Fix everything except the second variable; keys become scalars.
        reduced = dist.reduce(**{first: "b", third: 1, fourth: 44})
        assert reduced.rvs.size == 1
        assert all(compare(reduced.rvs.names, [second]))
        assert reduced["x"] == 11
        assert reduced["y"] == 15
        assert reduced.frequency("x") == 11
        assert reduced.frequency("y") == 15
        assert reduced.probability("x") == 11 / 26
        assert reduced.probability("y") == 15 / 26
def test_marginals_discrete_distribution():
    """Check marginal() on one- to four-variable distributions.

    marginal(*names) is expected to sum out the named variables: the total
    is preserved and the remaining keys hold the summed frequencies. A
    single-variable distribution cannot be marginalised and must raise
    ValueError.
    """

    def check(marginal, parent, expected):
        # Compare one marginal against its expected ``{key: frequency}`` map.
        assert marginal.total == parent.total
        assert all(compare(marginal.keys_as_list(), list(expected)))
        for key, frequency in expected.items():
            assert marginal[key] == frequency
        # Marginalising preserves the total, so the expected frequencies sum
        # to the denominator of the expected probabilities.
        denominator = sum(expected.values())
        for key, frequency in expected.items():
            assert marginal.probability(key) == frequency / denominator

    # Single RV dist.
    # Built outside the raises block so that only marginal() can satisfy the
    # expectation; a ValueError from the constructor must not pass the test.
    disc_dist = DiscreteDistribution({"A": 2, "B": 3, "C": 4})
    with pytest.raises(ValueError):
        disc_dist.marginal("X1")

    # Two levels dist.
    samples = {(1, 1): 4, (1, 2): 4, (2, 1): 6, (2, 2): 6}
    disc_dist = DiscreteDistribution(samples)
    check(disc_dist.marginal("X1"), disc_dist, {1: 10, 2: 10})
    check(disc_dist.marginal("X2"), disc_dist, {1: 8, 2: 12})

    samples = {("a", "x"): 4, ("a", "y"): 4, ("b", "x"): 6, ("b", "y"): 6}
    disc_dist = DiscreteDistribution(samples)
    check(disc_dist.marginal("X1"), disc_dist, {"x": 10, "y": 10})
    check(disc_dist.marginal("X2"), disc_dist, {"a": 8, "b": 12})

    # Three levels dist.
    samples = {
        ("a", "x", 1): 4,
        ("a", "x", 2): 4,
        ("a", "y", 1): 6,
        ("a", "y", 2): 6,
        ("b", "x", 1): 8,
        ("b", "x", 2): 8,
        ("b", "y", 1): 10,
        ("b", "y", 2): 10,
    }
    disc_dist = DiscreteDistribution(samples)
    check(
        disc_dist.marginal("X1"),
        disc_dist,
        {("x", 1): 12, ("x", 2): 12, ("y", 1): 16, ("y", 2): 16},
    )
    check(
        disc_dist.marginal("X2"),
        disc_dist,
        {("a", 1): 10, ("a", 2): 10, ("b", 1): 18, ("b", 2): 18},
    )
    check(
        disc_dist.marginal("X3"),
        disc_dist,
        {("a", "x"): 8, ("a", "y"): 12, ("b", "x"): 16, ("b", "y"): 20},
    )
    check(disc_dist.marginal("X1", "X2"), disc_dist, {1: 28, 2: 28})
    check(disc_dist.marginal("X1", "X3"), disc_dist, {"x": 24, "y": 32})
    check(disc_dist.marginal("X2", "X3"), disc_dist, {"a": 20, "b": 36})

    # Four levels dist.
    samples = {
        ("a", "x", 1, 33): 1, ("a", "x", 2, 33): 2,
        ("a", "x", 1, 44): 3, ("a", "x", 2, 44): 4,
        ("a", "y", 1, 33): 5, ("a", "y", 2, 33): 6,
        ("a", "y", 1, 44): 7, ("a", "y", 2, 44): 8,
        ("b", "x", 1, 33): 9, ("b", "x", 2, 33): 10,
        ("b", "x", 1, 44): 11, ("b", "x", 2, 44): 12,
        ("b", "y", 1, 33): 13, ("b", "y", 2, 33): 14,
        ("b", "y", 1, 44): 15, ("b", "y", 2, 44): 16,
    }
    disc_dist = DiscreteDistribution(samples)
    check(
        disc_dist.marginal("X3"),
        disc_dist,
        {
            ("a", "x", 33): 3,
            ("a", "x", 44): 7,
            ("a", "y", 33): 11,
            ("a", "y", 44): 15,
            ("b", "x", 33): 19,
            ("b", "x", 44): 23,
            ("b", "y", 33): 27,
            ("b", "y", 44): 31,
        },
    )
    check(
        disc_dist.marginal("X4"),
        disc_dist,
        {
            ("a", "x", 1): 4,
            ("a", "x", 2): 6,
            ("a", "y", 1): 12,
            ("a", "y", 2): 14,
            ("b", "x", 1): 20,
            ("b", "x", 2): 22,
            ("b", "y", 1): 28,
            ("b", "y", 2): 30,
        },
    )
    check(
        disc_dist.marginal("X1", "X4"),
        disc_dist,
        {("x", 1): 24, ("x", 2): 28, ("y", 1): 40, ("y", 2): 44},
    )
    check(disc_dist.marginal("X1", "X2", "X4"), disc_dist, {1: 64, 2: 72})

    # marginalize two times
    disc_dist2 = disc_dist.marginal("X1", "X4")
    disc_dist3 = disc_dist2.marginal("X2")
    check(disc_dist3, disc_dist, {1: 64, 2: 72})

    # marginalize three times
    disc_dist2 = disc_dist.marginal("X4")
    disc_dist3 = disc_dist2.marginal("X3")
    disc_dist4 = disc_dist3.marginal("X2")
    check(disc_dist4, disc_dist, {"a": 36, "b": 100})
def test_avg_discrete_distribution():
    """Check avg() and std() for several selections of variable indices."""
    samples = {
        (1, 1, 1): 1,
        (1, 1, 2): 1,
        (1, 1, 3): 1,
        (1, 2, 1): 2,
        (1, 2, 2): 2,
        (1, 2, 3): 2,
        (1, 3, 1): 3,
        (1, 3, 2): 3,
        (1, 3, 3): 3,
    }
    dist = DiscreteDistribution(samples)

    # Expected mean of each variable, in column order.
    means = [1, (3 + 12 + 27) / 18, 2]
    # Expected standard deviation of each variable, in column order.
    spreads = [0, approx(0.55555555555556), approx(0.66666666666667)]

    # Without indices, every variable is averaged in column order.
    assert all(compare(dist.avg(), means))

    # With several indices, the result follows the requested order.
    index_orders = (
        [0, 1, 2],
        [0, 2, 1],
        [0, 1],
        [0, 2],
        [2, 0],
        [1, 2],
    )
    for indices in index_orders:
        expected = [means[index] for index in indices]
        assert all(compare(dist.avg(indices=indices), expected))

    # With a single index, scalars are compared directly.
    for index, (mean, spread) in enumerate(zip(means, spreads)):
        assert dist.avg(indices=[index]) == mean
        assert dist.std(indices=[index]) == spread
def test_keys_consistencies_discrete_distribution():
    """Inconsistent keys must raise ``ValueError`` when checking is on.

    Each case is a ``(keys, names)`` pair passed to
    ``DiscreteDistribution(keys, names, consistencies=True)``. The keys
    mix value types within a level, mix tuple lengths, or contain ``None``.
    """
    one_name = ("X1",)
    three_names = ("X1", "X2", "X3")
    four_names = ("X1", "X2", "X3", "X4")

    inconsistent_cases = [
        # Scalars of mixed type.
        ([1, 2, 3, "A"], one_name),
        (["A", 1, 2, 3], one_name),
        # Tuples of mixed length.
        ([(1, ), (2, ), (3, ), (4, 5)], one_name),
        ([(4, 5), (1, ), (2, ), (3, )], one_name),
        ([(4, 5), (1, 3), (2, 3, 4), (3, 7)], one_name),
        # Three levels: a str where the other rows have an int.
        (
            [("a", "1", "w1"), ("b", 2, "w1"), ("c", 3, "w2"),
             ("d", 4, "w2")],
            three_names,
        ),
        (
            [("a", 1, "w1"), ("b", "2", "w1"), ("c", 3, "w2"),
             ("d", 4, "w2")],
            three_names,
        ),
        (
            [("a", 1, "w1"), ("b", 2, "w1"), ("c", 3, "w2"),
             ("d", "4", "w2")],
            three_names,
        ),
        # Three levels: a row that is too short.
        (
            [("a", 1, "w1"), ("b", 2), ("c", 3, "w2"), ("d", "4", "w2")],
            three_names,
        ),
        (
            [(1, "w1"), ("b", 2, "w1"), ("c", 3, "w2"), ("d", "4", "w2")],
            three_names,
        ),
        # Three levels: a None value.
        (
            [("a", None, "w1"), ("b", 2, "w1"), ("c", 3, "w2"),
             ("d", "4", "w2")],
            three_names,
        ),
        # Four levels: a None value.
        (
            [("a", 1, "w1", None), ("b", 2, "w1", 2), ("c", 3, "w2", 1)],
            four_names,
        ),
        (
            [("a", 1, "w1", 4), ("b", None, "w1", 2), ("c", 3, "w2", 1)],
            four_names,
        ),
        # Four levels: mixed types within one level.
        (
            [("a", 1, "w1", "4"), ("b", 2, "w1", 2), ("c", 3, "w2", 1)],
            four_names,
        ),
        (
            [("a", 1, 1, 4), ("b", 2, "w1", 2), ("c", 3, "w2", 1)],
            four_names,
        ),
        (
            [("a", 1, "w1", 4), ("b", "2", "w1", 2), ("c", 3, "w2", 1)],
            four_names,
        ),
        (
            [("a", 1, "w1", 4), (1, 2, "w1", 2), ("c", 3, "w2", 1)],
            four_names,
        ),
        (
            [("a", 1, "w1", 4), ("b", 2, "w1", 2), ("c", "3", "w2", "1")],
            four_names,
        ),
        # Four levels: a row that is too short.
        (
            [("a", 1, "w1", 4), ("b", 2, 2), ("c", 3, "w2", 1)],
            four_names,
        ),
    ]

    for keys, names in inconsistent_cases:
        with pytest.raises(ValueError):
            # A fresh list of names is handed over for every case.
            DiscreteDistribution(keys, list(names), consistencies=True)
def test_marginal_by_name_discrete_distribution():
    """Marginalise a four-level distribution by custom variable names.

    The levels are named ``Age``, ``Sex``, ``Edu`` and ``Etn``. Each
    marginal must keep the total of the original distribution and report
    the expected keys, frequencies and probabilities.
    """
    # Four levels dist.; the frequencies 1..16 sum to 136.
    samples = {
        ("a", "x", 1, 33): 1,
        ("a", "x", 2, 33): 2,
        ("a", "x", 1, 44): 3,
        ("a", "x", 2, 44): 4,
        ("a", "y", 1, 33): 5,
        ("a", "y", 2, 33): 6,
        ("a", "y", 1, 44): 7,
        ("a", "y", 2, 44): 8,
        ("b", "x", 1, 33): 9,
        ("b", "x", 2, 33): 10,
        ("b", "x", 1, 44): 11,
        ("b", "x", 2, 44): 12,
        ("b", "y", 1, 33): 13,
        ("b", "y", 2, 33): 14,
        ("b", "y", 1, 44): 15,
        ("b", "y", 2, 44): 16,
    }
    disc_dist = DiscreteDistribution(samples,
                                     names=["Age", "Sex", "Edu", "Etn"])

    def check(marginalised, expected):
        """Compare a marginal with an ordered ``{key: frequency}`` table."""
        assert marginalised.total == disc_dist.total
        assert all(compare(marginalised.keys_as_list(), list(expected)))
        for key, frequency in expected.items():
            assert marginalised[key] == frequency
        for key, frequency in expected.items():
            assert marginalised.probability(key) == frequency / 136

    # One variable removed.
    check(
        disc_dist.marginal("Edu"),
        {
            ("a", "x", 33): 3,
            ("a", "x", 44): 7,
            ("a", "y", 33): 11,
            ("a", "y", 44): 15,
            ("b", "x", 33): 19,
            ("b", "x", 44): 23,
            ("b", "y", 33): 27,
            ("b", "y", 44): 31,
        },
    )
    check(
        disc_dist.marginal("Etn"),
        {
            ("a", "x", 1): 4,
            ("a", "x", 2): 6,
            ("a", "y", 1): 12,
            ("a", "y", 2): 14,
            ("b", "x", 1): 20,
            ("b", "x", 2): 22,
            ("b", "y", 1): 28,
            ("b", "y", 2): 30,
        },
    )

    # Two variables removed in one call.
    check(
        disc_dist.marginal("Age", "Etn"),
        {("x", 1): 24, ("x", 2): 28, ("y", 1): 40, ("y", 2): 44},
    )

    # Three variables removed in one call: the keys are plain values.
    check(disc_dist.marginal("Age", "Sex", "Etn"), {1: 64, 2: 72})

    # marginalize two times
    twice = disc_dist.marginal("Age", "Etn").marginal("Sex")
    check(twice, {1: 64, 2: 72})

    # marginalize three times
    thrice = disc_dist.marginal("Etn").marginal("Edu").marginal("Sex")
    check(thrice, {"a": 36, "b": 100})
def from_multilevels_sample(cls, samples, names=None):
    """Return ``cls`` called on a ``DiscreteDistribution`` of ``samples``.

    ``samples`` and ``names`` are forwarded positionally to
    ``DiscreteDistribution``; the resulting object is the single argument
    passed to ``cls``.

    NOTE(review): the ``cls`` parameter suggests an alternate constructor
    (classmethod), but no decorator or enclosing class is visible here --
    confirm where this belongs.
    """
    return cls(DiscreteDistribution(samples, names))
def test_one_levels_discrete_distribution():
    """Check one-level distributions built from ``{key: count}`` dicts.

    Each case is ``(counts, absent_key, check_absent_prob)``. A key that
    is not in the distribution must have frequency 0; its probability is
    only checked where the flag is set (not for the single-key case).
    """
    cases = [
        ({"Dog": 2}, "Cat", False),
        ({"Dog": 2, "Cat": 3}, "Dolphin", True),
        ({"Dog": 2, "Cat": 3, "Dolphin": 4}, "Tiger", True),
    ]

    for counts, absent_key, check_absent_prob in cases:
        total = sum(counts.values())
        dist = DiscreteDistribution(dict(counts))

        # Keys keep their insertion order; there is one random variable.
        assert all(compare(dist.keys_as_list(), list(counts)))
        assert dist.rvs.size == 1

        # Raw frequencies by key, 0 for an unseen key.
        for key, count in counts.items():
            assert dist[key] == count
        assert dist[absent_key] == 0

        # Frequencies as a sequence, normalised and raw.
        assert all(
            compare(
                dist.frequencies(normalised=True),
                [count / total for count in counts.values()],
            ))
        assert all(
            compare(dist.frequencies(normalised=False),
                    list(counts.values())))

        # Probability by position and by the default variable name "X1".
        for key, count in counts.items():
            assert dist.prob(key) == count / total
            assert dist.prob(X1=key) == count / total
        if check_absent_prob:
            assert dist.prob(absent_key) == 0
            assert dist.prob(X1=absent_key) == 0
def test_marginals_names_exception_discrete_distribution():
    """``marginal`` must raise ``ValueError`` for invalid variable names.

    Fixtures and valid preparatory ``marginal`` calls are made outside
    the ``pytest.raises`` blocks, so only the call under test can satisfy
    the expectation. A ``ValueError`` raised during setup now fails the
    test instead of passing it for the wrong reason.
    """
    one_level = {"a": 3, "b": 4, "c": 5}
    two_levels = {("a", "x"): 4, ("a", "y"): 4, ("b", "x"): 6, ("b", "y"): 6}

    # Single-variable distribution.
    # NOTE(review): "X1" looks like the default name of the only variable,
    # so this presumably fails because nothing would be left rather than
    # because the name is unknown -- confirm.
    disc_dist = DiscreteDistribution(one_level)
    with pytest.raises(ValueError):
        disc_dist.marginal("X1")

    # Wrong rv name: default names of a two-level dist. are "X1" and "X2".
    disc_dist = DiscreteDistribution(two_levels)
    with pytest.raises(ValueError):
        disc_dist.marginal("X0")
    with pytest.raises(ValueError):
        disc_dist.marginal("X3")

    # Wrong rv name: "X1" is already gone after the first marginal.
    disc_dist2 = disc_dist.marginal("X1")
    with pytest.raises(ValueError):
        disc_dist2.marginal("X1")

    # Wrong rv name: default names are not valid once custom names are
    # given. (This case appeared twice, verbatim; it is checked once.)
    disc_dist = DiscreteDistribution(two_levels, names=["Y", "Z"])
    with pytest.raises(ValueError):
        disc_dist.marginal("X1")

    # Wrong rv name: "Y" is already gone after the first marginal.
    disc_dist2 = disc_dist.marginal("Y")
    with pytest.raises(ValueError):
        disc_dist2.marginal("Y")

    # Marginalize over all vars
    with pytest.raises(ValueError):
        disc_dist.marginal("Y", "Z")
def test_marginals_names_discrete_distribution():
    """Check which variable names remain after marginalisation.

    Covers two- and three-level distributions, each with the default
    names (``X1``, ``X2``, ...) and with custom names.
    """

    def assert_remaining(dist, expectations):
        """Check ``(removed_names, remaining_names)`` pairs against dist."""
        for removed, remaining in expectations:
            assert all(compare(dist.marginal(*removed).names, remaining))

    # Two levels dist.
    two_levels = {("a", "x"): 4, ("a", "y"): 4, ("b", "x"): 6, ("b", "y"): 6}

    assert_remaining(
        DiscreteDistribution(two_levels),
        [
            (("X1", ), ["X2"]),
            (("X2", ), ["X1"]),
        ],
    )
    assert_remaining(
        DiscreteDistribution(two_levels, names=["Y", "Z"]),
        [
            (("Y", ), ["Z"]),
            (("Z", ), ["Y"]),
        ],
    )

    # Three levels dist.
    three_levels = {
        ("a", "x", 1): 4,
        ("a", "x", 2): 4,
        ("a", "y", 1): 6,
        ("a", "y", 2): 6,
        ("b", "x", 1): 8,
        ("b", "x", 2): 8,
        ("b", "y", 1): 10,
        ("b", "y", 2): 10,
    }

    assert_remaining(
        DiscreteDistribution(three_levels),
        [
            (("X1", ), ["X2", "X3"]),
            (("X2", ), ["X1", "X3"]),
            (("X3", ), ["X1", "X2"]),
            (("X1", "X3"), ["X2"]),
            (("X2", "X3"), ["X1"]),
        ],
    )
    assert_remaining(
        DiscreteDistribution(three_levels, names=["Y", "Z", "W"]),
        [
            (("Y", ), ["Z", "W"]),
            (("Z", ), ["Y", "W"]),
            (("W", ), ["Y", "Z"]),
            (("Y", "W"), ["Z"]),
            (("Z", "W"), ["Y"]),
        ],
    )