def test_joint_variables__unequal_numbers_of_instances():
    """JointVariables must refuse to combine variables of differing lengths."""

    def labelled(instances, ID, name):
        # Create a Variable and tag it with the given ID and name.
        variable = Variable(instances)
        variable.ID = ID
        variable.name = name
        return variable

    # Both of these variables hold 6 instances.
    animals = labelled(['cat', 'dog', 'cat', 'mouse', 'dog', 'cat'], 3, 'animals')
    colors = labelled(['gray', 'yellow', 'brown', 'silver', 'white', 'gray'], 2, 'colors')

    # This one holds only 5 instances, so joining it with the others must fail.
    short_sizes = labelled(['small', 'small', 'large', 'small', 'normal'], 1, 'sizes')
    with pytest.raises(VariableInstancesOfUnequalCount):
        JointVariables(animals, colors, short_sizes)

    # With 6 instances the same combination is accepted.
    sizes = labelled(['small', 'small', 'large', 'small', 'normal', 'small'], 1, 'sizes')
    fauna = JointVariables(animals, colors, sizes)

    # A 5-instance variable cannot be joined with the 6-instance joint either.
    can_fly = labelled([False] * 5, 4, 'can_fly')
    with pytest.raises(VariableInstancesOfUnequalCount):
        JointVariables(fauna, can_fly)
Exemple #2
0
    def make_pmfs_from_datasetmatrix(self, X: int, Y: int, Zl: list[int]) -> tuple[CPMF, CPMF, CPMF, PMF]:
        """
        Build the distributions of X, Y and (X, Y) conditioned on Zl.

        X, Y and Zl are passed unchanged to self.load_variables, which provides
        the variable objects the distributions are built from.

        Returns the tuple (PrXYcZ, PrXcZ, PrYcZ, PrZ).
        """
        (VarX, VarY, VarZ) = self.load_variables(X, Y, Zl)

        if len(Zl) == 0:
            # Empty conditioning set: wrap the unconditional PMFs in their
            # Omega counterparts instead of conditioning on anything.
            joint_xy = PMF(JointVariables(VarX, VarY))
            marginal_x = PMF(VarX)
            marginal_y = PMF(VarY)
            omega = OmegaPMF()
            xy_given_omega = OmegaCPMF(joint_xy)
            x_given_omega = OmegaCPMF(marginal_x)
            y_given_omega = OmegaCPMF(marginal_y)
            return (xy_given_omega, x_given_omega, y_given_omega, omega)

        # Non-empty conditioning set: build the joint PMFs that include Z,
        # then condition each of them on the PMF of Z.
        joint_xyz = PMF(JointVariables(VarX, VarY, VarZ))
        joint_xz = PMF(JointVariables(VarX, VarZ))
        joint_yz = PMF(JointVariables(VarY, VarZ))
        marginal_z = PMF(VarZ)

        x_given_z = joint_xz.condition_on(marginal_z)
        y_given_z = joint_yz.condition_on(marginal_z)
        xy_given_z = joint_xyz.condition_on(marginal_z)

        return (xy_given_z, x_given_z, y_given_z, marginal_z)
def test_conditional_pmf__multiple_values():
    """
    Check a CPMF whose conditioned and conditioning sides are both joint.

    The first part pins individual conditional probabilities. The second part
    verifies that weighting each conditional distribution by the probability
    of its condition and summing over all conditions yields a total of 1.
    """
    sizes = Variable(['small', 'small', 'large', 'small', 'normal', 'small'])
    sizes.ID = 1
    sizes.name = 'sizes'

    colors = Variable(['gray', 'yellow', 'brown', 'silver', 'white', 'gray'])
    colors.ID = 2
    colors.name = 'colors'

    animals = Variable(['cat', 'dog', 'cat', 'snake', 'dog', 'cat'])
    animals.ID = 3
    animals.name = 'animals'

    is_pet = Variable(['yes', 'yes', 'yes', 'maybe', 'yes', 'yes'])
    is_pet.ID = 4
    is_pet.name = 'is_pet'

    Pr = CPMF(JointVariables(colors, is_pet), JointVariables(sizes, animals))

    assert Pr.given('small', 'cat').p('gray', 'yes') == 2 / 2
    assert Pr.given('small', 'cat').p('yellow', 'yes') == 0 / 1
    assert Pr.given('small', 'cat').p('brown', 'maybe') == 0 / 1

    assert Pr.given('small', 'dog').p('yellow', 'yes') == 1 / 1
    assert Pr.given('small', 'dog').p('yellow', 'maybe') == 0 / 1
    assert Pr.given('small', 'dog').p('silver', 'maybe') == 0 / 1

    assert Pr.given('large', 'cat').p('brown', 'yes') == 1 / 1
    assert Pr.given('large', 'cat').p('yellow', 'yes') == 0 / 1

    assert Pr.given('small', 'snake').p('silver', 'maybe') == 1 / 1
    assert Pr.given('small', 'snake').p('silver', 'no') == 0 / 1

    assert Pr.given('normal', 'dog').p('white', 'yes') == 1 / 1
    assert Pr.given('normal', 'dog').p('silver', 'yes') == 0 / 1
    assert Pr.given('normal', 'dog').p('yellow', 'maybe') == 0 / 1

    SA = JointVariables(sizes, animals)
    PrAll = CPMF(JointVariables(colors, is_pet), SA)
    PrSA = PMF(SA)
    PrCcSA = CPMF(colors, SA)
    PrIPcSA = CPMF(is_pet, SA)

    test_p_all = 0.0
    test_p_c = 0.0
    test_p_ip = 0.0

    for (sa, _) in PrSA.items():
        weight = PrSA.p(sa)
        colors_given_sa = PrCcSA.given(sa)
        is_pet_given_sa = PrIPcSA.given(sa)

        # Accumulate the is_pet total exactly once per condition. Doing this
        # inside the colour loop would count it once per colour and only sum
        # to 1 when every condition has a single colour.
        for (ip, pipsa) in is_pet_given_sa.items():
            test_p_ip += pipsa * weight

        for (c, pcsa) in colors_given_sa.items():
            test_p_c += pcsa * weight
            # The joint total needs every (colour, is_pet) combination.
            for (ip, _) in is_pet_given_sa.items():
                pall = PrAll.given(sa).p(c, ip)
                test_p_all += pall * weight

    assert almostEqual(1, test_p_all)
    assert almostEqual(1, test_p_c)
    assert almostEqual(1, test_p_ip)
def test_conditional_pmf__binary():
    """Check conditional probabilities computed from binary variables."""
    V0 = Variable([0, 1, 0, 1, 0, 1, 0, 1])
    V1 = Variable([0, 0, 1, 1, 0, 0, 1, 1])
    V2 = Variable([0, 0, 0, 0, 1, 1, 1, 1])
    V78 = Variable([0, 0, 0, 0, 0, 0, 1, 1])

    def check(Pr, expectations):
        # Each entry is (condition values, queried value, expected probability).
        for (condition, value, expected) in expectations:
            assert Pr.given(*condition).p(value) == expected

    check(CPMF(V0, V78), [
        ((0,), 0, 3 / 6),
        ((0,), 1, 3 / 6),
        ((1,), 0, 1 / 2),
        ((1,), 1, 1 / 2),
    ])

    check(CPMF(V2, V78), [
        ((0,), 0, 4 / 6),
        ((0,), 1, 2 / 6),
        ((1,), 0, 0 / 2),
        ((1,), 1, 2 / 2),
    ])

    check(CPMF(V78, V1), [
        ((0,), 0, 4 / 4),
        ((0,), 1, 0 / 4),
        ((1,), 0, 2 / 4),
        ((1,), 1, 2 / 4),
    ])

    # Conditioning on a joint variable: the condition is a (V2, V78) pair.
    # The pair (0, 1) never occurs in the data, so both queries expect 0.
    check(CPMF(V1, JointVariables(V2, V78)), [
        ((0, 0), 0, 2 / 4),
        ((0, 0), 1, 2 / 4),
        ((0, 1), 0, 0 / 1),
        ((0, 1), 1, 0 / 1),
        ((1, 0), 0, 2 / 2),
        ((1, 0), 1, 0 / 2),
        ((1, 1), 0, 0 / 2),
        ((1, 1), 1, 2 / 2),
    ])
def test_joint_variables_pmf():
    """Check the values, IDs and PMF of a joint of three variables."""

    def labelled(instances, ID, name):
        # Create a Variable and tag it with the given ID and name.
        variable = Variable(instances)
        variable.ID = ID
        variable.name = name
        return variable

    animals = labelled(['cat', 'dog', 'cat', 'mouse', 'dog', 'cat'], 3, 'animals')
    colors = labelled(['gray', 'yellow', 'brown', 'silver', 'white', 'gray'], 2, 'colors')
    sizes = labelled(['small', 'small', 'large', 'small', 'normal', 'small'], 1, 'sizes')

    members = (sizes, colors, animals)
    fauna = JointVariables(*members)
    fauna.update_values()
    assert [1, 2, 3] == fauna.variableIDs
    # The joint must hold the very same objects, in the order they were given.
    for (position, member) in enumerate(members):
        assert fauna.variables[position] is member

    assert fauna.values == [
        ('large', 'brown', 'cat'),
        ('normal', 'white', 'dog'),
        ('small', 'gray', 'cat'),
        ('small', 'silver', 'mouse'),
        ('small', 'yellow', 'dog'),
    ]

    PrFauna = PMF(fauna)
    assert PrFauna.p('small', 'gray', 'cat') == 2 / 6
    assert PrFauna.p('small', 'silver', 'mouse') == 1 / 6
    # A combination that never occurs in the data has probability 0.
    assert PrFauna.p('small', 'silver', 'dog') == 0

    # A joint built from a single variable exposes that variable's instances.
    lone_joint = JointVariables(animals)
    assert ['cat', 'dog', 'cat', 'mouse', 'dog', 'cat'] == lone_joint.instances()
Exemple #6
0
def test_G_value__lungcancer(ds_lungcancer_4e4):
    """Run the unoptimized G-test over a series of lungcancer variable triples."""
    Omega = ds_lungcancer_4e4.omega
    lungcancer = ds_lungcancer_4e4.datasetmatrix
    bn = ds_lungcancer_4e4.bayesiannetwork

    # Columns 0..7 of the 'X' matrix, fetched in index order.
    (ASIA, BRONC, DYSP, EITHER, LUNG, SMOKE, TUB, XRAY) = (
        lungcancer.get_variable('X', column) for column in range(8)
    )

    parameters = {
        'ci_test_significance': 0.95,
        'ci_test_debug': 0,
        'omega': Omega,
        'source_bayesian_network': bn,
        'ci_test_dof_calculator_class': mbtk.math.DoFCalculators.StructuralDoF,
    }

    G_test = mbtk.math.G_test__unoptimized.G_test(lungcancer, parameters)

    # ASIA against the other variables, with Omega as the conditioning set.
    for other in (SMOKE, LUNG, BRONC, TUB, EITHER, XRAY):
        assertCITestAccurate(G_test, ASIA, other, Omega)

    # EITHER conditioned on the joint (TUB, LUNG); a fresh joint per check.
    for other in (ASIA, SMOKE):
        assertCITestAccurate(G_test, EITHER, other, JointVariables(TUB, LUNG))

    # DYSP conditioned on the joint (EITHER, BRONC); a fresh joint per check.
    for other in (SMOKE, LUNG, TUB):
        assertCITestAccurate(G_test, DYSP, other, JointVariables(EITHER, BRONC))

    # XRAY conditioned on EITHER.
    for other in (TUB, LUNG, ASIA, SMOKE, DYSP, BRONC):
        assertCITestAccurate(G_test, XRAY, other, EITHER)

    # XRAY with Omega as the conditioning set.
    for other in (EITHER, LUNG, SMOKE, TUB):
        assertCITestAccurate(G_test, XRAY, other, Omega)
Exemple #7
0
def calculate_pmf_for_cmi(
    X: Variable,
    Y: Variable,
    Z: Union[Variable, JointVariables],
) -> tuple[CPMF, CPMF, CPMF, PMF]:
    """
    Build the four distributions derived from X, Y and the conditioning Z.

    Returns, in this order: the CPMF of the joint (X, Y) given Z, the CPMF of
    X given Z, the CPMF of Y given Z, and the PMF of Z.
    """
    joint_xy = JointVariables(X, Y)
    # Tuple elements are evaluated left to right, so the distributions are
    # constructed in the order in which they are returned.
    return (
        CPMF(joint_xy, Z),
        CPMF(X, Z),
        CPMF(Y, Z),
        PMF(Z),
    )
def test_make_cpmf_PrXcZ_variant_1() -> None:
    """Check the IDs and probabilities of a PMF over a joint of two variables."""
    V0 = Variable([0, 1, 1, 1, 0, 1, 0, 1])
    V1 = Variable([0, 0, 1, 1, 0, 1, 1, 1])

    PrXZ = PMF(JointVariables(V0, V1))

    # Calling IDs with arguments assigns them; calling it bare reads them back.
    PrXZ.IDs(1000, 1111)
    assert PrXZ.IDs() == (1000, 1111)

    expected_probabilities = {
        (0, 0): 2 / 8,
        (0, 1): 1 / 8,
        (1, 0): 1 / 8,
        (1, 1): 4 / 8,
    }
    for (pair, probability) in expected_probabilities.items():
        assert PrXZ.p(pair) == probability
def test_pmf_summing_over_variable():
    """Sum variables out of a four-variable PMF one at a time and check each result."""
    V0 = Variable([0, 1, 1, 1, 0, 1, 0, 1])
    V1 = Variable([0, 0, 1, 1, 0, 1, 1, 1])
    V2 = Variable([0, 0, 0, 0, 1, 0, 1, 1])
    V3 = Variable([0, 0, 0, 0, 0, 0, 1, 1])

    for (variable, ID) in zip((V0, V1, V2, V3), (1000, 1111, 1222, 1333)):
        variable.ID = ID

    def assert_probabilities(pmf, expected):
        # `expected` maps each queried value to its expected probability.
        for (value, probability) in expected.items():
            assert pmf.p(value) == probability

    Pr = PMF(JointVariables(V0, V1, V2, V3))
    assert Pr.IDs() == (1000, 1111, 1222, 1333)
    assert_probabilities(Pr, {
        (0, 0, 0, 0): 1 / 8,
        (1, 0, 0, 0): 1 / 8,
        (1, 1, 0, 0): 3 / 8,
        (0, 0, 1, 0): 1 / 8,
        (0, 1, 1, 1): 1 / 8,
        (1, 1, 1, 1): 1 / 8,
    })

    # Sum out V2: the remaining variables are V0, V1 and V3.
    Pr = Pr.sum_over(V2.ID)
    assert sum(Pr.probabilities.values()) == 1
    assert_probabilities(Pr, {
        (0, 0, 0): 2 / 8,
        (1, 0, 0): 1 / 8,
        (1, 1, 0): 3 / 8,
        (0, 1, 1): 1 / 8,
        (1, 1, 1): 1 / 8,
    })
    assert Pr.IDs() == (V0.ID, V1.ID, V3.ID)

    # Sum out V1: the remaining variables are V0 and V3.
    Pr = Pr.sum_over(V1.ID)
    assert sum(Pr.probabilities.values()) == 1
    assert_probabilities(Pr, {
        (0, 0): 2 / 8,
        (1, 0): 4 / 8,
        (0, 1): 1 / 8,
        (1, 1): 1 / 8,
    })
    assert Pr.IDs() == (V0.ID, V3.ID)

    # Sum out V0: only V3 remains, so values are queried as plain scalars.
    Pr = Pr.sum_over(V0.ID)
    assert sum(Pr.probabilities.values()) == 1

    print(Pr.probabilities)

    assert_probabilities(Pr, {0: 6 / 8, 1: 2 / 8})
    assert Pr.IDs() == (V3.ID,)
Exemple #10
0
    def get_variables(self, matrix_label, columns):
        """
        Fetch one or several variables through self.get_variable.

        Returns None when `columns` is None. When `columns` is an int or a
        one-element list, returns the result of a single self.get_variable
        call. Otherwise, returns a JointVariables built from the result of
        self.get_variable for every entry of `columns`, in order.
        """
        if columns is None:
            return None

        # A bare index and a one-element list both denote a single column.
        if isinstance(columns, int):
            return self.get_variable(matrix_label, columns)
        if isinstance(columns, list) and len(columns) == 1:
            return self.get_variable(matrix_label, columns[0])

        members = [self.get_variable(matrix_label, index) for index in columns]
        return JointVariables(*members)
Exemple #11
0
    def G_test_conditionally_independent(self, X: int, Y: int,
                                         Z: list[int]) -> CITestResult:
        """
        Run the G-test on X and Y given Z and report the outcome.

        X, Y and Z are passed unchanged to self.load_variables and to the
        DoF calculator. The returned CITestResult is marked as having
        insufficient samples when self.sufficient_samples rejects the
        calculated degrees of freedom. Otherwise it carries the G statistic,
        the chi2 CDF value at G, and the verdict: independent exactly when
        that CDF value is below self.significance.
        """
        (VarX, VarY, VarZ) = self.load_variables(X, Y, Z)

        result = CITestResult()
        result.start_timing()

        calculator = self.DoF_calculator

        if len(Z) == 0:
            # Empty conditioning set: use the Omega wrappers around the
            # unconditional PMFs.
            joint_xy = PMF(JointVariables(VarX, VarY))
            marginal_x = PMF(VarX)
            marginal_y = PMF(VarY)
            PrZ = OmegaPMF()
            PrXYcZ = OmegaCPMF(joint_xy)
            PrXcZ = OmegaCPMF(marginal_x)
            PrYcZ = OmegaCPMF(marginal_y)

            if calculator.requires_pmfs:
                calculator.set_context_pmfs(joint_xy, marginal_x, marginal_y, None)
        else:
            # Build the joint PMFs that include Z, then condition them on Z.
            joint_xyz = PMF(JointVariables(VarX, VarY, VarZ))
            joint_xz = PMF(JointVariables(VarX, VarZ))
            joint_yz = PMF(JointVariables(VarY, VarZ))
            PrZ = PMF(VarZ)

            PrXcZ = joint_xz.condition_on(PrZ)
            PrYcZ = joint_yz.condition_on(PrZ)
            PrXYcZ = joint_xyz.condition_on(PrZ)

            if calculator.requires_pmfs:
                calculator.set_context_pmfs(joint_xyz, joint_xz, joint_yz, PrZ)

        calculator.set_context_variables(X, Y, Z)

        if calculator.requires_cpmfs:
            calculator.set_context_cpmfs(PrXYcZ, PrXcZ, PrYcZ, PrZ)

        DoF = calculator.calculate_DoF(X, Y, Z)

        if self.sufficient_samples(DoF):
            G = self.G_value(PrXYcZ, PrXcZ, PrYcZ, PrZ)
            p = chi2.cdf(G, DoF)

            # bool() keeps the verdict a plain Python bool even if the
            # comparison yields another truthy/falsy type.
            independent = bool(p < self.significance)

            result.end_timing()
            result.index = self.ci_test_counter + 1
            result.set_independent(independent, self.significance)
            result.set_variables(VarX, VarY, VarZ)
            result.set_statistic('G', G, dict())
            result.set_distribution('chi2', p, {'DoF': DoF})
        else:
            # Not enough samples for this DoF: record that, without a verdict.
            result.end_timing()
            result.index = self.ci_test_counter + 1
            result.set_insufficient_samples()
            result.set_variables(VarX, VarY, VarZ)

        result.extra_info = f' DoF {DoF}'

        return result
Exemple #12
0
def calculate_pmf_for_mi(X: Variable, Y: Variable) -> tuple[PMF, PMF, PMF]:
    """
    Build the PMFs derived from X and Y.

    Returns, in this order: the PMF of the joint (X, Y), the PMF of X, and
    the PMF of Y.
    """
    joint_xy = JointVariables(X, Y)
    # Tuple elements are evaluated left to right, so the PMFs are constructed
    # in the order in which they are returned.
    return (PMF(joint_xy), PMF(X), PMF(Y))
def test_conditional_pmf__from_bayesian_network():
    """Compare distributions estimated from sampled data with those of the source network."""
    # With a random seed of 42 this test somehow needs 2e6 samples to pass,
    # whereas with the seed 1984 only 4e4 samples are enough. Maybe the
    # random generator is biased somehow?
    configuration = {
        'sourcepath': testutil.bif_folder / 'survey.bif',
        'sample_count': int(4e4),
        'random_seed': 1984,
        'values_as_indices': False,
        'objectives': ['R', 'TRN'],
    }

    bayesian_network = BayesianNetwork.from_bif_file(configuration['sourcepath'], use_cache=False)
    bayesian_network.finalize()

    sbnds = SampledBayesianNetworkDatasetSource(configuration)
    sbnds.reset_random_seed = True
    datasetmatrix = sbnds.create_dataset_matrix('test_sbnds')

    assert ['AGE', 'EDU', 'OCC', 'SEX'] == datasetmatrix.column_labels_X
    assert ['R', 'TRN'] == datasetmatrix.column_labels_Y

    def column_variable(matrix_label, column_label):
        # Wrap the named column of the sampled dataset in a Variable.
        return Variable(datasetmatrix.get_column_by_label(matrix_label, column_label))

    def reference_distribution(node_name):
        # The probability distribution stored on the network node.
        return bayesian_network.variable_nodes[node_name].probdist

    # AGE and SEX are compared as plain PMFs.
    AGE = column_variable('X', 'AGE')
    PrAge = PMF(AGE)

    SEX = column_variable('X', 'SEX')
    PrSex = PMF(SEX)

    assert_PMF_AlmostEquals_BNProbDist(reference_distribution('AGE'), PrAge)
    assert_PMF_AlmostEquals_BNProbDist(reference_distribution('SEX'), PrSex)

    # The remaining variables are compared as CPMFs.
    EDU = column_variable('X', 'EDU')
    PrEdu = CPMF(EDU, given=JointVariables(AGE, SEX))
    assert_CPMF_AlmostEquals_BNProbDist(reference_distribution('EDU'), PrEdu)

    OCC = column_variable('X', 'OCC')
    PrOcc = CPMF(OCC, given=EDU)
    assert_CPMF_AlmostEquals_BNProbDist(reference_distribution('OCC'), PrOcc)

    R = column_variable('Y', 'R')
    PrR = CPMF(R, given=EDU)
    assert_CPMF_AlmostEquals_BNProbDist(reference_distribution('R'), PrR)

    TRN = column_variable('Y', 'TRN')
    PrTRN = CPMF(TRN, given=JointVariables(OCC, R))
    assert_CPMF_AlmostEquals_BNProbDist(reference_distribution('TRN'), PrTRN)