Beispiel #1
0
    def test_tabu_edges_on_non_existing_edges_do_nothing(self, train_data_idx):
        """Forbidding edges that the unconstrained network never learns must leave the edge set unchanged"""

        forbidden = [("a", "d"), ("e", "a")]
        unconstrained = from_pandas(train_data_idx, w_threshold=0.3)
        constrained = from_pandas(
            train_data_idx, w_threshold=0.3, tabu_edges=forbidden
        )
        assert set(unconstrained.edges) == set(constrained.edges)
Beispiel #2
0
    def test_empty_data_raises_error(self):
        """
        An empty data set must be rejected with a ValueError.

        Failing early with a clear error is easier to catch and handle than the
        misleading division by zero or unpacking errors the user would otherwise see.
        """

        empty_frame = pd.DataFrame(data=[], columns=["a"])
        with pytest.raises(ValueError):
            from_pandas(empty_frame)
Beispiel #3
0
    def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
        """
        A single iteration is not enough to converge on this dataset, so a UserWarning is expected.
        """

        # pytest treats `match` as a regular expression searched in the warning text.
        expected_message = "Failed to converge. Consider increasing max_iter."
        with pytest.warns(UserWarning, match=expected_message):
            from_pandas(train_data_idx, max_iter=1)
Beispiel #4
0
 def test_array_with_nan_raises_error(self):
     """
     A data set containing NaN must be rejected with a ValueError that says so.

     Catching this early is useful because otherwise the user would end up with empty structures.
     """
     nan_frame = pd.DataFrame(data=[np.nan, 0], columns=["a"])
     expected_message = "Input contains NaN, infinity or a value too large for dtype*"
     with pytest.raises(ValueError, match=expected_message):
         from_pandas(nan_frame)
Beispiel #5
0
    def test_no_cycles(self, train_data_idx):
        """
        Structure learning must produce a directed acyclic graph
        """

        learned = from_pandas(train_data_idx, w_threshold=0.3)
        is_dag = nx.algorithms.is_directed_acyclic_graph(learned)
        assert is_dag
Beispiel #6
0
    def test_certain_relationships_get_near_certain_weight(self):
        """Every learned edge 0 -> 1 on ten identical (a=0, b=1) rows must have a weight in [0.99, 1]"""

        rows = [[0, 1]] * 10
        frame = pd.DataFrame(rows, columns=["a", "b"])
        graph = from_pandas(frame)
        # NOTE(review): the filter compares node labels with the integers 0 and 1
        # while the columns are named "a" and "b"; if nodes are labelled by
        # column name no edge is ever checked - confirm.
        for source, target, weight in graph.edges(data="weight"):
            if source == 0 and target == 1:
                assert 0.99 <= weight <= 1
Beispiel #7
0
    def test_behaves_same_as_seperate_calls(self, train_data_idx, train_data_discrete):
        """Fitting node states and CPDs in one call must match fitting them in two chained calls"""
        two_step = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))
        one_step = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))

        two_step.fit_node_states(train_data_discrete).fit_cpds(train_data_discrete)
        one_step.fit_node_states_and_cpds(train_data_discrete)

        assert two_step.edges == one_step.edges
        assert two_step.node_states == one_step.node_states

        expected_cpds, actual_cpds = two_step.cpds, one_step.cpds
        assert expected_cpds.keys() == actual_cpds.keys()

        # Compare the tables node by node.
        for node, table in expected_cpds.items():
            assert table.equals(actual_cpds[node])
Beispiel #8
0
    def test_inverse_relationships_get_negative_weight(self):
        """If observations indicate a==!b and b==!a then the weight of the relationship from a->b should be negative"""

        # DataFrame.append returns a new frame (and no longer exists in
        # pandas 2.0), so calling it without keeping the result silently dropped
        # the inverse rows. Build both halves and concatenate them instead.
        data = pd.concat(
            [
                pd.DataFrame([[0, 1] for _ in range(10)], columns=["a", "b"]),
                pd.DataFrame([[1, 0] for _ in range(10)], columns=["a", "b"]),
            ],
            ignore_index=True,
        )
        g = from_pandas(data)
        # NOTE(review): the filter compares node labels with the integers 0 and 1
        # while the columns are named "a" and "b"; if nodes are labelled by
        # column name no edge is ever checked - confirm.
        assert all(weight < 0 for u, v, weight in g.edges(data="weight")
                   if u == 0 and v == 1)
Beispiel #9
0
    def test_report_ignores_unrequired_columns_in_data(self, train_data_idx,
                                                       train_data_discrete,
                                                       test_data_c_discrete):
        """Classification report should cope with columns that predict does not need"""

        structure = from_pandas(train_data_idx, w_threshold=0.3)
        network = BayesianNetwork(structure).fit_node_states(train_data_discrete)
        # The extra column is added only after the node states have been fitted.
        extra_values = [1] * len(train_data_discrete)
        train_data_discrete["NEW_COL"] = extra_values
        network.fit_cpds(train_data_discrete)
        # No assertion: the test passes as long as the report is produced without raising.
        classification_report(network, test_data_c_discrete, "c")
Beispiel #10
0
    def test_query_when_cpds_not_fit(self, train_data_idx, train_data_discrete):
        """Creating an InferenceEngine before the CPDs are fit must raise a ValueError"""

        structure = from_pandas(train_data_idx, w_threshold=0.3)
        # Only the node states are fitted; fit_cpds is deliberately not called.
        network = BayesianNetwork(structure).fit_node_states(train_data_discrete)

        expected_message = r"Bayesian Network does not contain any CPDs.*"
        with pytest.raises(ValueError, match=expected_message):
            InferenceEngine(network)
Beispiel #11
0
    def test_multiple_tabu(self, train_data_idx):
        """Any edge related to tabu edges/parent nodes/child nodes should not exist in the network"""

        tabu_e = [("d", "a"), ("b", "c")]
        tabu_p = ["b"]
        tabu_c = ["a", "d"]
        g = from_pandas(
            train_data_idx,
            tabu_edges=tabu_e,
            tabu_parent_nodes=tabu_p,
            tabu_child_nodes=tabu_c,
        )
        # Nodes that appear as source / target of at least one learned edge.
        parents = {source for source, _ in g.edges}
        children = {target for _, target in g.edges}
        # all() is required here: asserting a bare list passes whenever the list
        # is non-empty, regardless of the booleans it contains.
        assert all(e not in g.edges for e in tabu_e)
        assert all(p not in parents for p in tabu_p)
        assert all(c not in children for c in tabu_c)
Beispiel #12
0
    def test_non_numeric_data_raises_error(self):
        """A data frame holding non-numeric values must be rejected with a ValueError"""

        text_frame = pd.DataFrame(data=["x"], columns=["a"])
        expected_message = "All columns must have numeric data.*"
        with pytest.raises(ValueError, match=expected_message):
            from_pandas(text_frame)
Beispiel #13
0
 def learn_notears(self, df, tabu_edges, thres):
     """Learn a structure model from ``df`` with NOTEARS.

     ``tabu_edges`` and ``thres`` are forwarded to ``from_pandas`` as its
     ``tabu_edges`` and ``w_threshold`` arguments.

     Returns the learned model together with its ``edges`` attribute.
     """
     structure = from_pandas(df, tabu_edges=tabu_edges, w_threshold=thres)
     learned_edges = structure.edges
     return structure, learned_edges
Beispiel #14
0
    def test_expected_structure_learned(self, train_data_idx, train_model):
        """On the small, hand-checkable data set the learned edges must equal those of ``train_model``"""

        learned_edges = set(from_pandas(train_data_idx, w_threshold=0.3).edges)
        expected_edges = set(train_model.edges)
        assert learned_edges == expected_edges
Beispiel #15
0
    def test_isolated_nodes_exist(self, train_data_idx):
        """Nodes must be kept in the learned structure even when thresholding leaves them without edges"""

        # A threshold of 1.0 is used so that nodes are likely to end up isolated.
        learned = from_pandas(train_data_idx, w_threshold=1.0)
        node_count = len(learned.nodes)
        column_count = len(train_data_idx.columns)
        assert node_count == column_count
Beispiel #16
0
    def test_all_columns_in_structure(self, train_data_idx):
        """The learned structure must have as many nodes as the data has columns"""

        learned = from_pandas(train_data_idx)
        node_count = len(learned.nodes)
        column_count = len(train_data_idx.columns)
        assert node_count == column_count
Beispiel #17
0
    def test_sparsity_against_without_reg(self, train_data_idx):
        """Compare the edge count of a structure learnt with regularisation against one learnt without"""

        regularised = from_pandas_lasso(train_data_idx, 0.1, w_threshold=0.3)
        unregularised = from_pandas(train_data_idx, w_threshold=0.3)
        # NOTE(review): regularisation is expected to give a sparser structure,
        # yet this asserts that the regularised graph has MORE edges - confirm
        # the intended direction of the comparison.
        regularised_count = len(regularised.edges)
        unregularised_count = len(unregularised.edges)
        assert regularised_count > unregularised_count
Beispiel #18
0
def bn(train_data_idx, train_data_discrete) -> BayesianNetwork:
    """Learn a structure from ``train_data_idx`` and fit node states and CPDs on ``train_data_discrete``."""
    structure = from_pandas(train_data_idx, w_threshold=0.3)
    network = BayesianNetwork(structure)
    return network.fit_node_states_and_cpds(train_data_discrete)
Beispiel #19
0
    def test_tabu_expected_child_nodes(self, train_data_idx):
        """Tabu child nodes should not have any ingoing edges"""

        tabu_c = ["a", "d", "b"]
        g = from_pandas(train_data_idx, tabu_child_nodes=tabu_c)
        # Nodes that are the target of at least one learned edge.
        children = {target for _, target in g.edges}
        # all() is required here: asserting a bare list passes whenever the list
        # is non-empty, regardless of the booleans it contains.
        assert all(c not in children for c in tabu_c)
Beispiel #20
0
    def test_tabu_expected_parent_nodes(self, train_data_idx):
        """Tabu parent nodes should not have any outgoing edges"""

        tabu_p = ["a", "d", "b"]
        g = from_pandas(train_data_idx, tabu_parent_nodes=tabu_p)
        # Nodes that are the source of at least one learned edge.
        parents = {source for source, _ in g.edges}
        # all() is required here: asserting a bare list passes whenever the list
        # is non-empty, regardless of the booleans it contains.
        assert all(p not in parents for p in tabu_p)
Beispiel #21
0
    def test_tabu_expected_edges(self, train_data_idx):
        """Tabu edges should not exist in the network"""

        tabu_e = [("d", "a"), ("b", "c")]
        g = from_pandas(train_data_idx, tabu_edges=tabu_e)
        # all() is required here: asserting a bare list passes whenever the list
        # is non-empty, regardless of the booleans it contains.
        assert all(e not in g.edges for e in tabu_e)
Beispiel #22
0
def bn(train_data_idx, train_data_discrete) -> BayesianNetwork:
    """Run structure learning, then estimate node states and CPDs on the discrete data"""
    learned_structure = from_pandas(train_data_idx, w_threshold=0.3)
    fitted = BayesianNetwork(learned_structure).fit_node_states_and_cpds(
        train_data_discrete
    )
    return fitted
# from src.utils.Clock import *
def clock(startTime, endTime):
    """Return the span between two timestamps as whole ``(minutes, seconds)``.

    Both timestamps are taken to be in seconds. Each part is truncated toward
    zero with ``int``, so fractional seconds are dropped.
    """
    delta = endTime - startTime
    whole_minutes = int(delta / 60)
    # Whatever is left after removing the whole minutes.
    leftover_seconds = int(delta - 60 * whole_minutes)
    return whole_minutes, leftover_seconds


# %% codecell
from causalnex.structure.notears import from_pandas
import time

# Timestamp taken right before structure learning so the run can be timed.
startTime: float = time.time()

# Learn a structure from labelEncData (defined outside this view).
carStructLearned = from_pandas(X=labelEncData)

# clock() yields a (minutes, seconds) tuple, which is printed as-is.
print(f"Time taken = {clock(startTime = startTime, endTime = time.time())}")

# %% codecell
from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

# Now visualize it:
# Render the learned structure with the WEAK node and edge styles.
viz = plot_structure(carStructLearned,
                     graph_attributes={"scale": "0.5"},
                     all_node_attributes=NODE_STYLE.WEAK,
                     all_edge_attributes=EDGE_STYLE.WEAK)
# NOTE(review): plain string concatenation - assumes curPath (defined outside
# this view) already ends with a path separator; confirm.
filename_carLearned = curPath + "car_learnedStructure.png"

# Write the rendered graph to the PNG file.
viz.draw(filename_carLearned)
# Labels for the two G3 codes.
G3_map = {0: "Fail", 1: "Pass"}

# Replace the values of each column through its map (the maps other than
# G3_map are defined outside this view).
discretised_data["absences"] = discretised_data["absences"].map(absences_map)
discretised_data["G1"] = discretised_data["G1"].map(G1_map)
discretised_data["G2"] = discretised_data["G2"].map(G2_map)
discretised_data["G3"] = discretised_data["G3"].map(G3_map)


# Split the data (90% train / 10% test, fixed seed)
train, test = train_test_split(discretised_data, train_size=0.9, test_size=0.1, random_state=7)


# Learn the structure model (takes 2-3 minutes)
start = time.time()

# NOTE(review): this first structure is overwritten by the second from_pandas
# call below before anything reads it, so this learning pass looks redundant -
# confirm before removing.
sm = from_pandas(structure_data)
sm.remove_edges_below_threshold(0.8)

# Learn again with the edge higher -> Medu forbidden and the 0.8 threshold applied directly.
sm = from_pandas(structure_data, tabu_edges=[("higher", "Medu")], w_threshold=0.8)
# Manual corrections to the learned edges.
sm.add_edge("failures", "G1")
sm.remove_edge("Pstatus", "G1")
sm.remove_edge("address", "G1")

# Keep only the largest subgraph.
sm = sm.get_largest_subgraph()

# Despite its name, `end` holds the elapsed seconds, not an end timestamp.
end = time.time() - start
print(int(end))


# Declare the Bayesian network model
bn = BayesianNetwork(sm)
Beispiel #25
0
#First causal nex model

from causalnex.structure import StructureModel
from causalnex.plots import plot_structure
import pandas as pd
from causalnex.structure.notears import from_pandas
from causalnex.network import BayesianNetwork
from sklearn.model_selection import train_test_split

# Load the cleaned HMEQ data from CSV.
data = pd.read_csv('../data/hmeq_clean.csv', delimiter=',')
# Convert every column to numeric; values that cannot be parsed become NaN.
data = data.apply(pd.to_numeric, errors='coerce')
# Drop the 'Unnamed: 0' column (presumably a saved index column - confirm).
data.drop(columns=['Unnamed: 0'], inplace=True)

# Learn an initial structure from the whole data frame.
sm = from_pandas(data)


def determine_structure():
    """Plot the module-level structure ``sm``, prune weak edges and correct the edges around BAD by hand."""

    # Plot the structure as learned.
    _, _, _ = plot_structure(sm)

    # Drop edges weaker than 0.8 and plot again.
    sm.remove_edges_below_threshold(0.8)
    _, _, _ = plot_structure(sm)

    # Now the right relationships have to be decided by hand. The plot shows
    # BAD determining VALUE and MORTDUE when it should be the other way round,
    # so those arrows are changed.
    for target in ("VALUE", "MORTDUE", "LOAN"):
        sm.remove_edge("BAD", target)
    sm.add_edge("MORTDUE", "BAD")
Beispiel #26
0
import os

# Directory that receives the encoding maps and the pickled structure model.
# open() does not expand "~", so the home directory is resolved explicitly.
bbn_dir = os.path.expanduser("~/work/phenophasebbn/bbn")

genotypes = pd.concat(cultivar, axis=1)
genotype_uniq = genotypes.drop_duplicates()
# Assigning .columns renames in place on every pandas version
# (set_axis(..., inplace=True) no longer exists in pandas 2.0).
genotype_uniq.columns = ['genotype', 'encoding']
genotype_map = dict(zip(genotype_uniq.genotype, genotype_uniq.encoding))

# hardcoded seasons as dict
season_map = {'season_4': 0, 'season_6': 1}

# write both encoding maps as JSON
with open(os.path.join(bbn_dir, "genotype_map.json"), "w") as outfile:
    json.dump(genotype_map, outfile)
with open(os.path.join(bbn_dir, "season_map.json"), "w") as outfile:
    json.dump(season_map, outfile)

# learn structure with NOTEARS, over 1000 iterations, and keep edge weights > 0.95
from causalnex.structure.notears import from_pandas
sm = from_pandas(X=dum_df, max_iter=1000, w_threshold=0.95)
# pickle the structure model
import pickle
# binary mode is required for pickle; the with-block closes the file even if
# pickle.dump raises
with open(os.path.join(bbn_dir, "nt_sm"), "wb") as smp:
    pickle.dump(sm, smp)

# output plot of learned graph
# no need to apply thresholding, since this is taken care of in the sm with w_threshold
from causalnex.plots import plot_structure
viz = plot_structure(sm)
viz.draw("sm_plot.png")
print("Finished writing metadata for encoding categoricals...")

print("Begin embedding expert knowledge into DAG...")

# learn structure with NOTEARS (max_iter=10 below) and keep edge weights > 0.95
#device = torch.cuda.is_available()
#print('GPU is available:', device)

print(
    "Attempting NO TEARS DAG structure learning with tabu edges and child noodes..."
)
from causalnex.structure.notears import from_pandas

# bl_tup is passed as the tabu (forbidden) edges; "season" is passed as a tabu
# child node.
learned_sm = from_pandas(X=dum_df,
                         max_iter=10,
                         w_threshold=0.95,
                         tabu_edges=bl_tup,
                         tabu_child_nodes=["season"])

print("Finished structure learning...begin pickling structure model.")
# pickle the learned structure model; binary mode is required for pickle and
# the with-block closes the file even if pickle.dump raises
with open("/work/phenophasebbn/bbn/notears_sm.pickle", "wb") as smp:
    pickle.dump(learned_sm, smp)

#print("Generating image of final DAG...")
#output plot of learned graph
# no need to apply thresholding, since this is taken care of in the sm with w_threshold
Beispiel #28
0
# Sanity check: encoding testMultivals must yield exactly the codes 0..7 in this order.
assert list(
    labelEncoder.fit_transform(y=testMultivals)) == [0, 1, 2, 3, 4, 5, 6, 7]

# %% markdown [markdown]
# Now apply the NOTEARS algo to learn the structure:

# %% codecell

from src.utils.Clock import *

from causalnex.structure.notears import from_pandas
import time

# Timestamp taken right before structure learning so the run can be timed.
startTime: float = time.time()

# Learn a structure from labelEncData (defined outside this view).
structureModelLearned = from_pandas(X=labelEncData)

# clock comes from the star import of src.utils.Clock; its result is printed as-is.
print(f"Time taken = {clock(startTime = startTime, endTime = time.time())}")

# %% codecell
# Now visualize it:
# Render the learned structure with the WEAK node and edge styles.
viz = plot_structure(structureModelLearned,
                     graph_attributes={"scale": "0.5"},
                     all_node_attributes=NODE_STYLE.WEAK,
                     all_edge_attributes=EDGE_STYLE.WEAK)
# NOTE(review): plain string concatenation - assumes curPath (defined outside
# this view) already ends with a path separator; confirm.
filename_learned = curPath + "structure_model_learnedStructure.png"

# Write the rendered graph to the PNG file.
viz.draw(filename_learned)
# Presumably shows the saved PNG as the cell output (Image is imported outside
# this view) - confirm.
Image(filename_learned)

# %% markdown [markdown]