def test_add_cpds():
    """Check that ``add_cpds`` validates factor types and stores the CPDs.

    Covers three situations: adding a CPD to a node of unknown type, adding a
    CPD whose type conflicts with the declared node type, and querying CPDs
    of nodes that have none.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    model = SemiparametricBN(arcs, [('d', pbn.CKDEType())])

    # Adding a CKDE to a node of unknown type fixes the node type.
    assert model.node_type('a') == pbn.UnknownFactorType()
    model.add_cpds([CKDE('a', [])])
    assert model.node_type('a') == pbn.CKDEType()

    # 'd' was declared as CKDE, so a linear Gaussian CPD must be rejected.
    with pytest.raises(ValueError) as err:
        model.add_cpds([LinearGaussianCPD('d', ['a', 'b', 'c'])])
    assert "Bayesian network expects type" in str(err.value)

    fitted_lg = LinearGaussianCPD('b', ['a'], [2.5, 1.65], 4)
    unfitted_ckde = CKDE('d', ['a', 'b', 'c'])
    assert fitted_lg.fitted()
    assert not unfitted_ckde.fitted()

    model.add_cpds([fitted_lg, unfitted_ckde])

    # After resetting the type of 'a', asking for its CPD raises.
    model.set_node_type('a', pbn.UnknownFactorType())
    with pytest.raises(ValueError) as err:
        model.cpd('a').fitted()
    assert "CPD of variable \"a\" not added. Call add_cpds() or fit() to add the CPD." in str(
        err.value)

    assert model.cpd('b').fitted()

    # No CPD was ever added for 'c'.
    with pytest.raises(ValueError) as err:
        model.cpd('c').fitted()
    assert "CPD of variable \"c\" not added. Call add_cpds() or fit() to add the CPD." in str(
        err.value)

    assert not model.cpd('d').fitted()
def dyn_other_fit_bytes():
    """Build, fit and pickle a ``DynamicOtherBN`` with mixed node types.

    The static and transition networks each receive one explicit linear
    Gaussian CPD before the dynamic network is fitted on generated data.
    ``test_serialization_fitted_dbn`` takes a parameter with this name.

    Returns:
        The ``pickle.dumps`` output for the fitted model, with
        ``include_cpd`` set to ``True``.
    """
    variables = ["a", "b", "c", "d"]

    # Static nodes are the t_1 and t_2 slices; transition nodes are t_0.
    static_nodes = []
    for name in variables:
        for lag in range(1, 3):
            static_nodes.append("{}_t_{}".format(name, lag))
    transition_nodes = ["{}_t_0".format(name) for name in variables]

    static_types = [("b_t_2", pbn.DiscreteFactorType()),
                    ("b_t_1", pbn.DiscreteFactorType()),
                    ("c_t_1", pbn.CKDEType()),
                    ("d_t_1", pbn.LinearGaussianCPDType())]
    other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], static_types)
    other_static.add_cpds([LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2)])

    transition_types = [("b_t_0", pbn.DiscreteFactorType()),
                        ("c_t_0", pbn.CKDEType()),
                        ("d_t_0", pbn.LinearGaussianCPDType())]
    other_transition = ConditionalOtherBN(transition_nodes, static_nodes,
                                          [("a_t_2", "d_t_0")],
                                          transition_types)
    other_transition.add_cpds(
        [LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5)])

    assert other_static.type() == other_transition.type()

    dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition)

    # Replace the continuous column "b" with the discrete column "B".
    training = util_test.generate_normal_data_indep(1000)
    discrete = util_test.generate_discrete_data_dependent(1000)
    training["b"] = discrete["B"]

    dyn_other.fit(training)
    dyn_other.include_cpd = True
    return pickle.dumps(dyn_other)
def test_serialization_dbn_model(dyn_gaussian_bytes, dyn_spbn_bytes,
                                 dyn_kde_bytes, dyn_discrete_bytes,
                                 dyn_genericbn_bytes, dyn_newbn_bytes,
                                 dyn_otherbn_bytes):
    """Unpickle each dynamic network and check its structure and type.

    Every argument is a pickled dynamic Bayesian network. For each one the
    variables, the static arcs, the transition arcs and the network type are
    compared against the expected values.
    """
    expected_variables = {"a", "b", "c", "d"}
    static_arcs = [("a_t_2", "d_t_1")]

    def load_and_check(payload, transition_arcs, network_type):
        # Shared structural checks; returns the model for extra assertions.
        model = pickle.loads(payload)
        assert set(model.variables()) == expected_variables
        assert model.static_bn().arcs() == static_arcs
        assert model.transition_bn().arcs() == transition_arcs
        assert model.type() == network_type
        return model

    load_and_check(dyn_gaussian_bytes, [("c_t_2", "b_t_0")],
                   pbn.GaussianNetworkType())

    loaded_s = load_and_check(dyn_spbn_bytes, [("c_t_2", "b_t_0")],
                              pbn.SemiparametricBNType())
    # Only b_t_0 has an explicit type in the transition network.
    expected_types = dict.fromkeys(
        (name + "_t_0" for name in loaded_s.variables()),
        pbn.UnknownFactorType())
    expected_types["b_t_0"] = pbn.CKDEType()
    assert loaded_s.transition_bn().node_types() == expected_types

    load_and_check(dyn_kde_bytes, [("c_t_2", "b_t_0")], pbn.KDENetworkType())
    load_and_check(dyn_discrete_bytes, [("c_t_2", "b_t_0")],
                   pbn.DiscreteBNType())
    load_and_check(dyn_genericbn_bytes, [("a_t_2", "b_t_0")],
                   MyRestrictedGaussianNetworkType())
    load_and_check(dyn_newbn_bytes, [("a_t_2", "b_t_0")],
                   MyRestrictedGaussianNetworkType())

    loaded_other = load_and_check(dyn_otherbn_bytes, [("a_t_2", "b_t_0")],
                                  NonHomogeneousType())
    assert loaded_other.extra_info == "extra"

    other_static = loaded_other.static_bn()
    assert other_static.node_type("c_t_1") == pbn.DiscreteFactorType()
    assert other_static.node_type("d_t_1") == pbn.CKDEType()
    assert loaded_other.transition_bn().node_type("d_t_0") == pbn.CKDEType()
def dyn_otherbn_bytes():
    """Pickle an unfitted ``DynamicOtherBN`` with custom arcs and node types.

    ``test_serialization_dbn_model`` takes a parameter with this name.

    Returns:
        The ``pickle.dumps`` output for the configured model.
    """
    model = DynamicOtherBN(["a", "b", "c", "d"], 2)

    static = model.static_bn()
    static.add_arc("a_t_2", "d_t_1")
    static.set_node_type("c_t_1", pbn.DiscreteFactorType())
    static.set_node_type("d_t_1", pbn.CKDEType())

    transition = model.transition_bn()
    transition.add_arc("a_t_2", "b_t_0")
    transition.set_node_type("d_t_0", pbn.CKDEType())

    return pickle.dumps(model)
def test_holdout_local_score_null_spbn():
    """Compare holdout local scores with the reference on data with NaNs.

    Rows drawn at random (seeded) are set to NaN in every column before the
    ``HoldoutLikelihood`` is built. Each local score is then compared with
    ``numpy_local_score`` on the same train/test split.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()),
                                       ('c', pbn.CKDEType())])

    # Draw the null positions in column order a, b, c, d to keep the
    # random stream reproducible.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100)
                 for column in ('a', 'b', 'c', 'd')}

    df_null = df.copy()
    for column, rows in null_rows.items():
        df_null.loc[df_null.index[rows], column] = np.nan

    hl = pbn.HoldoutLikelihood(df_null, 0.2, seed)

    # (factor type, node, parents for the score, parents for the reference)
    cases = [
        (pbn.CKDEType(), 'a', [], []),
        (pbn.LinearGaussianCPDType(), 'b', ['a'], ['a']),
        (pbn.CKDEType(), 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        # Same score with the reference parents in a different order.
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for factor_type, node, parents, reference_parents in cases:
        assert np.isclose(
            hl.local_score(spbn, node, parents),
            numpy_local_score(factor_type,
                              hl.training_data().to_pandas(),
                              hl.test_data().to_pandas(),
                              node, reference_parents))

    # Omitting the parents must use the parents stored in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert hl.local_score(spbn, node) == hl.local_score(
            spbn, node, spbn.parents(node))
def test_serialization_bn_model(gaussian_bytes, spbn_bytes, kde_bytes,
                                discrete_bytes, genericbn_bytes, newbn_bytes,
                                otherbn_bytes):
    """Unpickle each Bayesian network and check nodes, arcs and type.

    Every argument is a pickled Bayesian network. Each loaded model is
    checked against the expected node set, arc list and network type; the
    semiparametric and "other" models are also checked for node types.
    """
    expected_nodes = {"a", "b", "c", "d"}
    expected_arcs = [("a", "b")]

    def check_structure(model, network_type):
        # Always inspects the model passed in, so no check can silently
        # re-test a previously loaded network.
        assert set(model.nodes()) == expected_nodes
        assert model.arcs() == expected_arcs
        assert model.type() == network_type

    loaded_g = pickle.loads(gaussian_bytes)
    check_structure(loaded_g, pbn.GaussianNetworkType())

    loaded_s = pickle.loads(spbn_bytes)
    check_structure(loaded_s, pbn.SemiparametricBNType())
    assert loaded_s.node_types() == {
        'a': pbn.UnknownFactorType(),
        'b': pbn.CKDEType(),
        'c': pbn.UnknownFactorType(),
        'd': pbn.UnknownFactorType()
    }

    loaded_k = pickle.loads(kde_bytes)
    check_structure(loaded_k, pbn.KDENetworkType())

    loaded_d = pickle.loads(discrete_bytes)
    check_structure(loaded_d, pbn.DiscreteBNType())

    loaded_gen = pickle.loads(genericbn_bytes)
    check_structure(loaded_gen, MyRestrictedGaussianNetworkType())

    # Previously the node check here inspected loaded_g instead of loaded_nn.
    loaded_nn = pickle.loads(newbn_bytes)
    check_structure(loaded_nn, MyRestrictedGaussianNetworkType())

    # Previously the node check here inspected loaded_g instead of loaded_o.
    loaded_o = pickle.loads(otherbn_bytes)
    check_structure(loaded_o, NonHomogeneousType())
    assert loaded_o.node_types() == {
        'a': pbn.UnknownFactorType(),
        'b': pbn.LinearGaussianCPDType(),
        'c': pbn.CKDEType(),
        'd': pbn.DiscreteFactorType()
    }
    assert loaded_o.extra_info == "extra"

    assert loaded_nn.type() != loaded_o.type()
def test_cvl_local_score_spbn():
    """Compare ``CVLikelihood`` local scores with ``numpy_local_score``.

    Checks ``local_score`` with explicit parents, ``local_score`` with the
    network's own parents, and ``local_score_node_type`` with a factor type
    that differs from the one declared in the network.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()),
                                       ('c', pbn.CKDEType())])
    cvl = pbn.CVLikelihood(df, 10, seed)

    # (factor type, node, parents for the score, parents for the reference)
    network_type_cases = [
        (pbn.CKDEType(), 'a', [], []),
        (pbn.LinearGaussianCPDType(), 'b', ['a'], ['a']),
        (pbn.CKDEType(), 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        # Same score with the reference parents in a different order.
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for factor_type, node, parents, reference_parents in network_type_cases:
        assert np.isclose(
            cvl.local_score(spbn, node, parents),
            numpy_local_score(factor_type, df, node, reference_parents))

    # Omitting the parents must use the parents stored in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(spbn, node) == cvl.local_score(
            spbn, node, spbn.parents(node))

    # Here the factor type is passed explicitly to the score.
    explicit_type_cases = [
        (pbn.LinearGaussianCPDType(), 'a', [], []),
        (pbn.CKDEType(), 'b', ['a'], ['a']),
        (pbn.LinearGaussianCPDType(), 'c', ['a', 'b'], ['a', 'b']),
        (pbn.CKDEType(), 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.CKDEType(), 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for factor_type, node, parents, reference_parents in explicit_type_cases:
        assert np.isclose(
            cvl.local_score_node_type(spbn, factor_type, node, parents),
            numpy_local_score(factor_type, df, node, reference_parents))
def test_node_type():
    """Check default node types and that ``set_node_type`` overrides them."""
    names = ['a', 'b', 'c', 'd']
    model = SemiparametricBN(names)

    assert model.num_nodes() == 4
    assert model.num_arcs() == 0
    assert model.nodes() == ['a', 'b', 'c', 'd']

    # Nodes start with an unknown factor type.
    for node in model.nodes():
        assert model.node_type(node) == pbn.UnknownFactorType()

    # The type of a node can be set and then replaced.
    model.set_node_type('b', pbn.CKDEType())
    assert model.node_type('b') == pbn.CKDEType()

    model.set_node_type('b', pbn.LinearGaussianCPDType())
    assert model.node_type('b') == pbn.LinearGaussianCPDType()
def test_apply():
    """Apply each operator to a Gaussian and a semiparametric network.

    Arc operators must add, flip and remove the arc on both network kinds.
    ``ChangeNodeType`` must fail on the Gaussian network and succeed on the
    semiparametric one.
    """

    def exercise_arc_operators(network):
        # Add a -> b, flip it to b -> a, then remove it again.
        assert not network.has_arc('a', 'b')

        pbn.AddArc("a", "b", 1).apply(network)
        assert network.num_arcs() == 1
        assert network.has_arc('a', 'b')

        pbn.FlipArc("a", "b", 1).apply(network)
        assert network.num_arcs() == 1
        assert not network.has_arc('a', 'b')
        assert network.has_arc('b', 'a')

        pbn.RemoveArc("b", "a", 1).apply(network)
        assert network.num_arcs() == 0
        assert not network.has_arc('b', 'a')

    gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'])
    assert gbn.num_arcs() == 0
    exercise_arc_operators(gbn)

    # A Gaussian network rejects a CKDE node type.
    change_type = pbn.ChangeNodeType("a", pbn.CKDEType(), 1)
    with pytest.raises(ValueError) as err:
        change_type.apply(gbn)
    assert "Wrong factor type" in str(err.value)

    spbn = pbn.SemiparametricBN(['a', 'b', 'c', 'd'])
    assert spbn.num_arcs() == 0

    change_type = pbn.ChangeNodeType("a", pbn.CKDEType(), 1)
    assert spbn.node_type('a') == pbn.UnknownFactorType()
    change_type.apply(spbn)
    assert spbn.node_type('a') == pbn.CKDEType()

    exercise_arc_operators(spbn)
def test_factor_type():
    """Check that a factor's type depends on its class, not its arguments.

    Factors of the same class built with different variables and evidence
    must report equal types; factors of different classes must not.
    """

    def first_of_uniform_family(factor_class, expected_type):
        # Build three factors of one class and check they share one type.
        family = [factor_class("a", []),
                  factor_class("b", ["a"]),
                  factor_class("c", ["b", "a"])]
        assert family[0].type() == expected_type
        for left, right in ((0, 1), (0, 2), (1, 2)):
            assert family[left].type() == family[right].type()
        return family[0]

    lg1 = first_of_uniform_family(pbn.LinearGaussianCPD,
                                  pbn.LinearGaussianCPDType())
    c1 = first_of_uniform_family(pbn.CKDE, pbn.CKDEType())
    d1 = first_of_uniform_family(pbn.DiscreteFactor, pbn.DiscreteFactorType())

    # Types of different factor classes are all distinct.
    assert lg1.type() != c1.type()
    assert lg1.type() != d1.type()
    assert c1.type() != d1.type()
def test_serialization_factor_type(lg_type_bytes, ckde_type_bytes,
                                   discrete_type_bytes, new_type_bytes,
                                   other_type_bytes):
    """Check that unpickled factor types equal freshly constructed ones.

    Every argument is a pickled factor type. The freshly constructed types
    must also be pairwise different from each other.
    """
    loaded_lg = pickle.loads(lg_type_bytes)
    fresh_lg = pbn.LinearGaussianCPDType()
    assert fresh_lg == loaded_lg

    loaded_ckde = pickle.loads(ckde_type_bytes)
    fresh_ckde = pbn.CKDEType()
    assert loaded_ckde == fresh_ckde

    loaded_discrete = pickle.loads(discrete_type_bytes)
    fresh_discrete = pbn.DiscreteFactorType()
    assert loaded_discrete == fresh_discrete

    loaded_new = pickle.loads(new_type_bytes)
    fresh_new = NewType()
    assert loaded_new == fresh_new

    loaded_other = pickle.loads(other_type_bytes)
    fresh_other = OtherType()
    assert loaded_other == fresh_other

    # Every unordered pair of distinct factor types must compare unequal.
    fresh_types = [fresh_lg, fresh_ckde, fresh_discrete, fresh_new,
                   fresh_other]
    for position, left in enumerate(fresh_types):
        for right in fresh_types[position + 1:]:
            assert left != right
def test_opposite():
    """Check the operator returned by ``opposite`` for each operator kind.

    The opposite must have the expected class, endpoints (or node and node
    type) and a negated delta.
    """
    bn = pbn.SemiparametricBN(["a", "b"])

    # (operator class, class of its opposite, opposite source, opposite target)
    arc_cases = (
        (pbn.AddArc, pbn.RemoveArc, 'a', 'b'),
        (pbn.RemoveArc, pbn.AddArc, 'a', 'b'),
        # Flipping is undone by flipping the reversed arc.
        (pbn.FlipArc, pbn.FlipArc, 'b', 'a'),
    )
    for operator_class, opposite_class, source, target in arc_cases:
        reverse = operator_class("a", "b", 1).opposite(bn)
        assert reverse.source() == source
        assert reverse.target() == target
        assert reverse.delta() == -1
        assert type(reverse) == opposite_class

    # The opposite of a type change restores the type currently in the network.
    bn.set_node_type("a", pbn.LinearGaussianCPDType())
    reverse = pbn.ChangeNodeType("a", pbn.CKDEType(), 1).opposite(bn)
    assert reverse.node() == 'a'
    assert reverse.node_type() == pbn.LinearGaussianCPDType()
    assert reverse.delta() == -1
    assert type(reverse) == pbn.ChangeNodeType
def cond_other_partial_fit_bytes():
    """Pickle a ``ConditionalOtherBN`` in which only node "d" has a CPD.

    Returns:
        The ``pickle.dumps`` output for the model, with ``include_cpd`` set
        to ``True``.
    """
    node_types = [("c", pbn.CKDEType()),
                  ("d", pbn.LinearGaussianCPDType())]
    model = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")],
                               node_types)

    # Only "d" receives a CPD; "c" is left without one.
    model.add_cpds([LinearGaussianCPD("d", [], [3], 1.5)])
    model.include_cpd = True
    return pickle.dumps(model)
def other_partial_fit_bytes():
    """Pickle an ``OtherBN`` in which only node "b" has a CPD.

    Returns:
        The ``pickle.dumps`` output for the model, with ``include_cpd`` set
        to ``True``.
    """
    node_types = [("b", pbn.LinearGaussianCPDType()),
                  ("c", pbn.CKDEType()),
                  ("d", pbn.DiscreteFactorType())]
    model = OtherBN(["a", "b", "c", "d"], [("a", "b")], node_types)

    # Only "b" receives a CPD; the other nodes are left without one.
    model.add_cpds([LinearGaussianCPD("b", ["a"], [1, 2], 2)])
    model.include_cpd = True
    return pickle.dumps(model)
def test_cpd():
    """Check ``cpd()`` before and after fitting a semiparametric network."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    model = SemiparametricBN(arcs, [('d', pbn.CKDEType())])

    # No CPD is available before fitting.
    with pytest.raises(ValueError) as err:
        model.cpd('a')
    assert "not added" in str(err.value)

    model.fit(df)

    # Only 'd' was declared as CKDE; the others end up linear Gaussian.
    expected_types = (
        ('a', pbn.LinearGaussianCPDType()),
        ('b', pbn.LinearGaussianCPDType()),
        ('c', pbn.LinearGaussianCPDType()),
        ('d', pbn.CKDEType()),
    )
    for node, factor_type in expected_types:
        assert model.cpd(node).type() == factor_type

    for node, _ in expected_types:
        assert model.cpd(node).fitted()
def test_holdout_score():
    """Check that the holdout score equals the sum of its local scores."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    hl = pbn.HoldoutLikelihood(df, 0.2, 0)

    # Gaussian network: the parents are passed explicitly.
    gbn = pbn.GaussianNetwork(arcs)
    explicit_parents = (('a', []),
                        ('b', ['a']),
                        ('c', ['a', 'b']),
                        ('d', ['a', 'b', 'c']))
    assert np.isclose(
        hl.score(gbn),
        sum(hl.local_score(gbn, node, parents)
            for node, parents in explicit_parents))

    # Semiparametric network: the parents come from the network itself.
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()),
                                       ('c', pbn.CKDEType())])
    assert np.isclose(
        hl.score(spbn),
        sum(hl.local_score(spbn, node) for node in ('a', 'b', 'c', 'd')))
def test_cvl_score():
    """Check that the cross-validated score equals the sum of local scores."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    cv = pbn.CVLikelihood(df, 10, 0)

    # Gaussian network: the parents are passed explicitly.
    gbn = pbn.GaussianNetwork(arcs)
    explicit_parents = (('a', []),
                        ('b', ['a']),
                        ('c', ['a', 'b']),
                        ('d', ['a', 'b', 'c']))
    assert np.isclose(
        cv.score(gbn),
        sum(cv.local_score(gbn, node, parents)
            for node, parents in explicit_parents))

    # Semiparametric network: the parents come from the network itself.
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()),
                                       ('c', pbn.CKDEType())])
    assert np.isclose(
        cv.score(spbn),
        sum(cv.local_score(spbn, node) for node in ('a', 'b', 'c', 'd')))
def test_create():
    """Check that operators expose the values passed to their constructors."""
    # Each arc operator gets a distinct delta so mix-ups are detected.
    arc_cases = ((pbn.AddArc, 1), (pbn.RemoveArc, 2), (pbn.FlipArc, 3))
    for operator_class, delta in arc_cases:
        operator = operator_class("a", "b", delta)
        assert operator.source() == 'a'
        assert operator.target() == 'b'
        assert operator.delta() == delta

    change_type = pbn.ChangeNodeType("a", pbn.CKDEType(), 4)
    assert change_type.node() == 'a'
    assert change_type.node_type() == pbn.CKDEType()
    assert change_type.delta() == 4
def test_holdout_local_score_spbn():
    """Compare holdout local scores with the reference implementation.

    Each local score of the semiparametric network is compared with
    ``numpy_local_score`` on the same train/test split.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()),
                                       ('c', pbn.CKDEType())])
    hl = pbn.HoldoutLikelihood(df, 0.2, seed)

    # (factor type, node, parents for the score, parents for the reference)
    cases = [
        (pbn.CKDEType(), 'a', [], []),
        (pbn.LinearGaussianCPDType(), 'b', ['a'], ['a']),
        (pbn.CKDEType(), 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        # Same score with the reference parents in a different order.
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for factor_type, node, parents, reference_parents in cases:
        assert np.isclose(
            hl.local_score(spbn, node, parents),
            numpy_local_score(factor_type,
                              hl.training_data().to_pandas(),
                              hl.test_data().to_pandas(),
                              node, reference_parents))

    # Omitting the parents must use the parents stored in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert hl.local_score(spbn, node) == hl.local_score(
            spbn, node, spbn.parents(node))
def numpy_local_score(node_type, data, variable, evidence):
    """Compute a cross-validated log-likelihood with NumPy and SciPy.

    The data is split with ``pbn.CrossValidation(data, 10, seed)``. For each
    fold a model of ``variable`` given ``evidence`` is estimated on the
    training part, and the log-likelihood of the test part is accumulated.
    Rows with missing values in the selected columns are dropped from both
    parts.

    Args:
        node_type: ``pbn.LinearGaussianCPDType()`` selects a least-squares
            linear regression with Gaussian noise; ``pbn.CKDEType()``
            selects a Gaussian kernel density estimate. Any other value
            contributes nothing to the result.
        data: Dataset accepted by ``pbn.CrossValidation``.
        variable: Column name (``str``) or positional column index.
        evidence: List of evidence columns, of the same kind as ``variable``.

    Returns:
        The log-likelihood summed over all folds.
    """
    by_name = isinstance(variable, str)
    columns = [variable] + evidence

    def split_columns(frame):
        # Returns (complete rows, variable column, evidence columns).
        if by_name:
            block = frame.loc[:, columns].dropna()
            return block, block.loc[:, variable], block.loc[:, evidence]
        block = frame.iloc[:, columns].dropna()
        return block, block.iloc[:, 0], block.iloc[:, 1:]

    def joint_bandwidth(kde):
        # Bandwidth factor applied to the joint kernel density estimate.
        return np.power(4 / (kde.d + 2), 1 / (kde.d + 4)) * kde.scotts_factor()

    total = 0
    for train_df, test_df in pbn.CrossValidation(data, 10, seed):
        node_data, variable_data, evidence_data = split_columns(
            train_df.to_pandas())
        test_node_data, test_variable_data, test_evidence_data = \
            split_columns(test_df.to_pandas())

        if node_type == pbn.LinearGaussianCPDType():
            n_rows = variable_data.shape[0]
            n_evidence = evidence_data.shape[1]

            # Least-squares fit with an intercept column of ones.
            design = np.column_stack(
                (np.ones(n_rows), evidence_data.to_numpy()))
            beta, res, _, _ = np.linalg.lstsq(design,
                                              variable_data.to_numpy(),
                                              rcond=None)

            # Residual sum of squares divided by the degrees of freedom.
            var = res / (n_rows - n_evidence - 1)
            means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1)
            total += norm.logpdf(test_variable_data, means,
                                 np.sqrt(var)).sum()
        elif node_type == pbn.CKDEType():
            k_joint = gaussian_kde(node_data.to_numpy().T,
                                   bw_method=joint_bandwidth)
            if evidence:
                # Conditional density = joint density / evidence density;
                # the evidence KDE reuses the joint bandwidth factor.
                k_marg = gaussian_kde(evidence_data.to_numpy().T,
                                      bw_method=k_joint.covariance_factor())
                total += np.sum(
                    k_joint.logpdf(test_node_data.to_numpy().T) -
                    k_marg.logpdf(test_evidence_data.to_numpy().T))
            else:
                total += np.sum(k_joint.logpdf(test_node_data.to_numpy().T))

    return total
def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes,
                                       new_bytes, newbis_bytes):
    """Unpickle unfitted factors and check variable, evidence and type.

    Every argument is a pickled unfitted factor of variable "c" with
    evidence "a" and "b". The custom factors must also build factors of
    their own class through ``new_factor``.
    """

    def load_unfitted(payload):
        # Checks shared by every factor; returns the loaded factor.
        factor = pickle.loads(payload)
        assert factor.variable() == "c"
        assert set(factor.evidence()) == {"a", "b"}
        assert not factor.fitted()
        return factor

    loaded_lg = load_unfitted(lg_bytes)
    assert loaded_lg.type() == pbn.LinearGaussianCPDType()

    loaded_ckde = load_unfitted(ckde_bytes)
    assert loaded_ckde.type() == pbn.CKDEType()

    loaded_discrete = load_unfitted(discrete_bytes)
    assert loaded_discrete.type() == pbn.DiscreteFactorType()

    loaded_new = load_unfitted(new_bytes)
    assert type(loaded_new.type()) == NewType
    reference_new = NewFactor("a", [])
    assert loaded_new.type() == reference_new.type()

    from pybnesian import GaussianNetwork

    dummy_network = GaussianNetwork(["a", "b", "c", "d"])
    assert type(loaded_new.type().new_factor(dummy_network, "a",
                                             [])) == NewFactor

    loaded_newbis = load_unfitted(newbis_bytes)
    assert type(loaded_newbis.type()) == NewType
    reference_newbis = NewFactorBis("a", [])
    assert loaded_newbis.type() == reference_newbis.type()
    assert type(loaded_newbis.type().new_factor(dummy_network, "a",
                                                [])) == NewFactorBis

    # Types of different factor classes are pairwise distinct.
    distinct = [loaded_lg, loaded_ckde, loaded_discrete, loaded_new]
    for position, left in enumerate(distinct):
        for right in distinct[position + 1:]:
            assert left.type() != right.type()

    # Both custom factors report equal types.
    assert loaded_newbis.type() == loaded_new.type()
def dyn_other_partial_fit_bytes():
    """Pickle a ``DynamicOtherBN`` in which only two nodes have CPDs.

    The static and transition networks each receive one explicit linear
    Gaussian CPD; the dynamic network is not fitted.
    ``test_serialization_fitted_dbn`` takes a parameter with this name.

    Returns:
        The ``pickle.dumps`` output for the model, with ``include_cpd`` set
        to ``True``.
    """
    variables = ["a", "b", "c", "d"]

    # Static nodes are the t_1 and t_2 slices; transition nodes are t_0.
    static_nodes = []
    for name in variables:
        for lag in range(1, 3):
            static_nodes.append("{}_t_{}".format(name, lag))
    transition_nodes = ["{}_t_0".format(name) for name in variables]

    static_types = [("b_t_1", pbn.DiscreteFactorType()),
                    ("c_t_1", pbn.CKDEType()),
                    ("d_t_1", pbn.LinearGaussianCPDType())]
    other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], static_types)
    other_static.add_cpds([LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2)])

    transition_types = [("b_t_0", pbn.DiscreteFactorType()),
                        ("c_t_0", pbn.CKDEType()),
                        ("d_t_0", pbn.LinearGaussianCPDType())]
    other_transition = ConditionalOtherBN(transition_nodes, static_nodes,
                                          [("a_t_2", "d_t_0")],
                                          transition_types)
    other_transition.add_cpds(
        [LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5)])

    assert other_static.type() == other_transition.type()

    dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition)
    dyn_other.include_cpd = True
    return pickle.dumps(dyn_other)
def cond_other_fit_bytes():
    """Pickle a ``ConditionalOtherBN`` whose two nodes have fitted CPDs.

    Node "c" gets a ``CKDE`` fitted on generated continuous data and node
    "d" gets a ``DiscreteFactor`` fitted on generated discrete data.

    Returns:
        The ``pickle.dumps`` output for the model, with ``include_cpd`` set
        to ``True``.
    """
    other = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")],
                               [("c", pbn.CKDEType()),
                                ("d", pbn.DiscreteFactorType())])

    cpd_c = CKDE("c", ["a"])
    df_continuous = util_test.generate_normal_data_indep(100)
    cpd_c.fit(df_continuous)

    df_discrete = util_test.generate_discrete_data_dependent(100)
    # Lower-case the column names so they match the node names.
    df_discrete.columns = df_discrete.columns.str.lower()
    # Built once here; an earlier duplicate construction was a dead store.
    cpd_d = DiscreteFactor("d", [])
    cpd_d.fit(df_discrete)

    other.add_cpds([cpd_c, cpd_d])
    other.include_cpd = True
    return pickle.dumps(other)
def other_fit_bytes():
    """Pickle an ``OtherBN`` in which every node has a fitted CPD.

    Nodes "a" and "b" get linear Gaussian CPDs with explicit parameters,
    "c" gets a ``CKDE`` fitted on generated continuous data and "d" gets a
    ``DiscreteFactor`` fitted on generated discrete data.

    Returns:
        The ``pickle.dumps`` output for the model, with ``include_cpd`` set
        to ``True``.
    """
    node_types = [("b", pbn.LinearGaussianCPDType()),
                  ("c", pbn.CKDEType()),
                  ("d", pbn.DiscreteFactorType())]
    model = OtherBN(["a", "b", "c", "d"], [("a", "b")], node_types)

    factor_a = LinearGaussianCPD("a", [], [0], 0.5)
    factor_b = LinearGaussianCPD("b", ["a"], [1, 2], 2)

    continuous = util_test.generate_normal_data_indep(100)
    factor_c = CKDE("c", [])
    factor_c.fit(continuous)

    discrete = util_test.generate_discrete_data_dependent(100)
    # Lower-case the column names so they match the node names.
    discrete.columns = discrete.columns.str.lower()
    factor_d = DiscreteFactor("d", [])
    factor_d.fit(discrete)

    model.add_cpds([factor_a, factor_b, factor_c, factor_d])
    model.include_cpd = True
    return pickle.dumps(model)
def test_fit():
    """Check ``fit()`` on a semiparametric network across structure changes.

    Covers the unfitted state, the first fit, a stale CPD after an arc is
    removed, refitting after the removal, and refitting after a node type
    change.
    """
    spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'),
                             ('b', 'd'), ('c', 'd')])

    # No CPD exists before fitting.
    with pytest.raises(ValueError) as ex:
        for n in spbn.nodes():
            spbn.cpd(n)
    assert "not added" in str(ex.value)

    spbn.fit(df)

    for n in spbn.nodes():
        cpd = spbn.cpd(n)
        assert cpd.type() == pbn.LinearGaussianCPDType()

        assert type(cpd) == pbn.LinearGaussianCPD
        assert cpd.variable() == n
        assert set(cpd.evidence()) == set(spbn.parents(n))

    spbn.fit(df)

    spbn.remove_arc('a', 'b')

    # The stored CPD of 'b' still has the removed parent as evidence.
    cpd_b = spbn.cpd('b')
    assert type(cpd_b) == pbn.LinearGaussianCPD
    # evidence() must be called: comparing the bound method itself with a
    # list is always unequal and would check nothing.
    assert cpd_b.evidence() != spbn.parents('b')

    # Refitting brings the CPD back in line with the current parents.
    spbn.fit(df)
    cpd_b = spbn.cpd('b')
    assert type(cpd_b) == pbn.LinearGaussianCPD
    assert cpd_b.evidence() == spbn.parents('b')

    # After a node type change the CPD is unavailable until the next fit.
    spbn.set_node_type('c', pbn.CKDEType())
    with pytest.raises(ValueError) as ex:
        spbn.cpd('c')
    assert "not added" in str(ex.value)

    spbn.fit(df)
    cpd_c = spbn.cpd('c')
    assert cpd_c.type() == spbn.node_type('c')
def test_mle_create():
    """Check which factor types ``pbn.MLE`` accepts."""
    # A CKDE factor type is rejected.
    with pytest.raises(ValueError) as err:
        pbn.MLE(pbn.CKDEType())
    assert "MLE not available" in str(err.value)

    # A linear Gaussian factor type constructs without raising.
    pbn.MLE(pbn.LinearGaussianCPDType())
def dyn_spbn_bytes():
    """Pickle an unfitted ``DynamicSemiparametricBN`` with custom structure.

    ``test_serialization_dbn_model`` takes a parameter with this name.

    Returns:
        The ``pickle.dumps`` output for the configured model.
    """
    model = pbn.DynamicSemiparametricBN(["a", "b", "c", "d"], 2)

    model.static_bn().add_arc("a_t_2", "d_t_1")

    transition = model.transition_bn()
    transition.add_arc("c_t_2", "b_t_0")
    transition.set_node_type("b_t_0", pbn.CKDEType())

    return pickle.dumps(model)
def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes,
                                  dyn_gaussian_fit_bytes,
                                  dyn_other_partial_fit_bytes,
                                  dyn_other_fit_bytes):
    """Unpickle partially and fully fitted dynamic networks and check them.

    Every argument is a pickled dynamic Bayesian network. Partially fitted
    models must report ``fitted() == False`` but keep the CPDs that were
    added. Fully fitted models must report ``fitted() == True`` for the
    dynamic, static and transition networks.
    """

    def check_linear_gaussian(cpd, variable, evidence, beta, variance):
        # Compares a loaded linear Gaussian CPD with its expected parameters.
        assert cpd.variable() == variable
        assert cpd.evidence() == evidence
        assert list(cpd.beta) == beta
        assert cpd.variance == variance

    def check_fitted_state(model, expected):
        # The dynamic, static and transition networks must all agree.
        assert model.fitted() == expected
        assert model.static_bn().fitted() == expected
        assert model.transition_bn().fitted() == expected

    def check_other_model(model):
        # Node types and explicit CPDs shared by both "other" models.
        static = model.static_bn()
        transition = model.transition_bn()

        assert static.node_type("b_t_1") == pbn.DiscreteFactorType()
        assert static.node_type("c_t_1") == pbn.CKDEType()
        assert static.node_type("d_t_1") == pbn.LinearGaussianCPDType()

        assert transition.node_type("b_t_0") == pbn.DiscreteFactorType()
        assert transition.node_type("c_t_0") == pbn.CKDEType()
        assert transition.node_type("d_t_0") == pbn.LinearGaussianCPDType()

        check_linear_gaussian(static.cpd("d_t_1"), "d_t_1", ["a_t_2"],
                              [1, 2], 2)
        check_linear_gaussian(transition.cpd("d_t_0"), "d_t_0", ["a_t_2"],
                              [3, 4], 1.5)

    # ####################
    # Gaussian partial fit
    # ####################
    loaded_partial = pickle.loads(dyn_gaussian_partial_fit_bytes)
    check_fitted_state(loaded_partial, False)
    check_linear_gaussian(loaded_partial.static_bn().cpd("d_t_1"), "d_t_1",
                          ["a_t_2"], [1, 2], 2)
    check_linear_gaussian(loaded_partial.transition_bn().cpd("b_t_0"),
                          "b_t_0", ["c_t_2"], [3, 4], 5)

    # ####################
    # Gaussian fit
    # ####################
    loaded_fitted = pickle.loads(dyn_gaussian_fit_bytes)
    check_fitted_state(loaded_fitted, True)

    # ####################
    # Other partial fit
    # ####################
    loaded_partial = pickle.loads(dyn_other_partial_fit_bytes)
    check_fitted_state(loaded_partial, False)
    check_other_model(loaded_partial)

    # ####################
    # Other fit
    # ####################
    loaded_fitted = pickle.loads(dyn_other_fit_bytes)
    check_fitted_state(loaded_fitted, True)
    # Inspect the fitted model itself; these checks previously re-tested
    # loaded_partial and so never looked at the fitted model.
    # NOTE(review): the parameter checks assume fit() leaves already fitted
    # CPDs untouched -- confirm against the library's fit() semantics.
    check_other_model(loaded_fitted)
def ckde_type_bytes():
    """Return the ``pickle.dumps`` output for a ``pbn.CKDEType`` instance.

    ``test_serialization_factor_type`` takes a parameter with this name.
    """
    return pickle.dumps(pbn.CKDEType())
def test_cvl_local_score_null_spbn():
    """Compare ``CVLikelihood`` local scores with the reference on NaN data.

    Rows drawn at random (seeded) are set to NaN in every column before the
    ``CVLikelihood`` is built. Checks ``local_score`` with explicit parents,
    ``local_score`` with the network's own parents, and
    ``local_score_node_type`` with an explicit factor type.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()),
                                       ('c', pbn.CKDEType())])

    # Draw the null positions in column order a, b, c, d to keep the
    # random stream reproducible.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100)
                 for column in ('a', 'b', 'c', 'd')}

    df_null = df.copy()
    for column, rows in null_rows.items():
        df_null.loc[df_null.index[rows], column] = np.nan

    cvl = pbn.CVLikelihood(df_null, 10, seed)

    # (factor type, node, parents for the score, parents for the reference)
    network_type_cases = [
        (pbn.CKDEType(), 'a', [], []),
        (pbn.LinearGaussianCPDType(), 'b', ['a'], ['a']),
        (pbn.CKDEType(), 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        # Same score with the reference parents in a different order.
        (pbn.LinearGaussianCPDType(), 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for factor_type, node, parents, reference_parents in network_type_cases:
        assert np.isclose(
            cvl.local_score(spbn, node, parents),
            numpy_local_score(factor_type, df_null, node, reference_parents))

    # Omitting the parents must use the parents stored in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(spbn, node) == cvl.local_score(
            spbn, node, spbn.parents(node))

    # Here the factor type is passed explicitly to the score.
    explicit_type_cases = [
        (pbn.LinearGaussianCPDType(), 'a', [], []),
        (pbn.CKDEType(), 'b', ['a'], ['a']),
        (pbn.LinearGaussianCPDType(), 'c', ['a', 'b'], ['a', 'b']),
        (pbn.CKDEType(), 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.CKDEType(), 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for factor_type, node, parents, reference_parents in explicit_type_cases:
        assert np.isclose(
            cvl.local_score_node_type(spbn, factor_type, node, parents),
            numpy_local_score(factor_type, df_null, node, reference_parents))