def chain_network() -> BayesianNetwork:
    """
    Build a fitted Bayesian Network over a simple chain, used to test
    do-interventions that split the graph into subgraphs.

    a → b → c → d → e
    """
    node_labels = list("abcde")
    sample_count = 50

    # Each cell is 1 when the integer drawn from [0, 10) exceeds 6, else 0.
    draws = np.random.randint(10, size=(sample_count, len(node_labels)))
    data = pd.DataFrame(data=(draws > 6).astype(int), columns=node_labels)

    # Link every node to its successor: (a, b), (b, c), (c, d), (d, e).
    structure = StructureModel()
    structure.add_edges_from(list(zip(node_labels, node_labels[1:])))

    network = BayesianNetwork(structure)
    network = network.fit_node_states(data)
    network = network.fit_cpds(data, method="BayesianEstimator", bayes_prior="K2")
    return network
def test_all_nodes_exist(self):
    """Connected and unconnected nodes alike must appear in the plotted graph"""
    structure = StructureModel([("a", "b")])
    structure.add_node("c")  # isolated node, no edges

    plotted = plot_structure(structure)

    for expected in ["a", "b", "c"]:
        assert expected in plotted.nodes()
def __init__(self,
             vars: Union[int, List[str]],
             causal_graph: Union[StructureModel, None] = None,
             env_type: str = 'Switchboard',
             state_repeats: int = 1,
             allow_interventions: bool = True):
    """
    Set up variable names, the causal model and empty bookkeeping state.

    :param vars: either the number of variables (names 'x0', 'x1', ... are
        generated) or an explicit list of variable names.
    :param causal_graph: optional pre-built model to use as the causal model.
        When omitted, a fresh StructureModel holding one node per variable
        name is created.
    :param env_type: stored on the instance as ``env_type``.
    :param state_repeats: stored on the instance as ``state_repeats``.
    :param allow_interventions: stored on the instance as
        ``allow_interventions``.
    """
    self.allow_interventions = allow_interventions
    self.env_type = env_type

    # isinstance (rather than an exact type comparison) is the idiomatic
    # check and also accepts int subclasses.
    if isinstance(vars, int):
        self.var_names = ['x' + str(i) for i in range(vars)]
    else:
        self.var_names = vars

    # initialize causal model
    # NOTE(review): this is a truthiness test, so if StructureModel defines
    # __len__ (as networkx graphs do) a supplied-but-empty graph is treated
    # as "not supplied" and replaced below — confirm this is intended.
    if causal_graph:
        self.causal_model = causal_graph
    else:
        self.causal_model = StructureModel()
        for name in self.var_names:
            self.causal_model.add_node(name)
        # NOTE(review): the reset is assumed to apply only to the freshly
        # built model, not to a caller-supplied graph — confirm.
        self.reset_causal_model()

    # initialize the storages for observational and interventional data.
    self.collected_data = {}

    self.action_space = None
    self.observation_space = None
    self.actions = []
    self.current_action = None
    self.state_repeats = state_repeats
def test_f1score_generated(self, adjacency_mat_num_stability):
    """The structure learnt with regularisation should score a very high
    f1 against the ground-truth structure"""
    labels = ["a", "b", "c", "d", "e"]
    df = pd.DataFrame(adjacency_mat_num_stability, columns=labels, index=labels)

    ground_truth = StructureModel(df.values)
    X = generate_continuous_dataframe(StructureModel(df), 50, noise_scale=1, seed=1)
    learnt = from_numpy(X[labels].values, lasso_beta=0.1, w_threshold=0.25)

    true_edges = ground_truth.edges
    n_predicted = len(learnt.edges)
    n_hits = len(set(learnt.edges).intersection(set(true_edges)))
    n_relevant = len(true_edges)

    precision = n_hits / n_predicted
    recall = n_hits / n_relevant
    f1_score = 2 * (precision * recall) / (precision + recall)

    assert f1_score > 0.85
def test_all_states_included(self):
    """Every state observed for a node should be recorded in node_states"""
    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "b", 1)])

    # Ten rows, with both columns taking each value 0..9 once.
    frame = pd.DataFrame([[i, i] for i in range(10)], columns=["a", "b"])
    bn = BayesianNetwork(structure).fit_node_states(frame)

    for state in range(10):
        assert state in bn.node_states["a"]
def train_model() -> StructureModel:
    """
    The Bayesian Model structure shared by all tests; every fixture
    adheres to it.

    Cause-only nodes: [d, e]
    Effect-only nodes: [a, c]
    Cause / Effect nodes: [b]

            d
         ↙  ↓  ↘
       a ←  b  → c
            ↑  ↗
            e
    """
    # Children listed per parent, in the order the edges are inserted.
    children_by_parent = {
        "b": ["a", "c"],
        "d": ["a", "c", "b"],
        "e": ["c", "b"],
    }
    edges = [
        (parent, child)
        for parent, children in children_by_parent.items()
        for child in children
    ]

    structure = StructureModel()
    structure.add_edges_from(edges)
    return structure
def test_fit_with_null_states_raises_error(self):
    """Fitting node states on data containing a null must raise a ValueError"""
    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "b", 1)])

    # Column "a" holds a None state.
    null_frame = pd.DataFrame([[None, 1]], columns=["a", "b"])

    with pytest.raises(ValueError, match="node '.*' contains None state"):
        BayesianNetwork(structure).fit_node_states(null_frame)
def test_intercept(self, distribution, noise_scale):
    """Enabling the intercept should shift the mean of an isolated node
    while leaving its standard deviation nearly unchanged"""
    graph = StructureModel()
    graph.add_node("123")

    # Both runs share everything except the intercept flag.
    shared = dict(
        n_samples=100000,
        distribution=distribution,
        noise_scale=noise_scale,
        seed=10,
    )
    data_noint = generate_continuous_data(graph, intercept=False, **shared)
    data_intercept = generate_continuous_data(graph, intercept=True, **shared)

    col_noint = data_noint[:, 0]
    col_intercept = data_intercept[:, 0]

    assert not np.isclose(col_noint.mean(), col_intercept.mean())
    assert np.isclose(col_noint.std(), col_intercept.std(), rtol=0.01)
def make_default_scm(radiomic_features=None):
    """
    Build the default StructureModel.

    The base edges point 'biopsy_grade', 'subtypes', 'histology' and 'stage'
    at 'response', and 'clinical_nodal_status' at 'stage'. When
    ``radiomic_features`` is given, 'Age' additionally points at
    'ovarian_status' and at every radiomic feature, and every radiomic
    feature points at 'response'.

    :param radiomic_features: optional iterable of radiomic feature names.
    :return: the populated StructureModel.
    """
    base_children = {
        'biopsy_grade': ['response'],
        'subtypes': ['response'],
        'histology': ['response'],
        'clinical_nodal_status': ['stage'],
        'stage': ['response'],
    }

    if radiomic_features is None:
        children_by_parent = base_children
    else:
        # Key order is kept: 'Age' first, then the base edges, then the
        # per-feature edges.
        children_by_parent = {
            'Age': ['ovarian_status', *radiomic_features],
            **base_children,
            **{feature: ['response'] for feature in radiomic_features},
        }

    edge_pairs = []
    for parent, children in children_by_parent.items():
        for child in children:
            edge_pairs.append((parent, child))

    sm = StructureModel()
    sm.add_edges_from(edge_pairs)
    return sm
def test_intercept(self, distribution, n_categories, noise_scale):
    """With the intercept enabled, fewer than half of the category means
    of an isolated node should stay close to the no-intercept means"""
    graph = StructureModel()
    graph.add_node("A")

    # Both runs share everything except the intercept flag.
    shared = dict(
        noise_scale=noise_scale,
        n_categories=n_categories,
        seed=10,
    )
    data_noint = generate_categorical_dataframe(
        graph, 100000, distribution, intercept=False, **shared
    )
    data_intercept = generate_categorical_dataframe(
        graph, 100000, distribution, intercept=True, **shared
    )

    # NOTE: as n_categories increases, the probability that at least one
    # category with intercept=True will be the same as intercept=False -> 1.0
    close_mask = np.isclose(
        data_intercept.mean(axis=0),
        data_noint.mean(axis=0),
        atol=0.05,
        rtol=0,
    )
    num_similar = close_mask.sum()

    assert num_similar < n_categories / 2
def test_add_weighted_edges_from_other(self):
    """Adding weighted edges with an unrecognised origin must raise a ValueError"""
    structure = StructureModel()
    weighted_edges = [(1, 2, 0.5)]

    with pytest.raises(ValueError, match="^Unknown origin: must be one of.*$"):
        structure.add_weighted_edges_from(weighted_edges, origin="other")
def test_isolates(self):
    """A structure model made only of isolated nodes has no largest subgraph"""
    structure = StructureModel()
    structure.add_nodes_from([1, 3, 5, 2, 7])

    largest = structure.get_largest_subgraph()

    assert largest is None
def test_number_of_nodes(self, num_nodes):
    """Every generated row should have exactly num_nodes entries"""
    # Chain 0 -> 1 -> ... -> num_nodes - 1, each edge with weight 1.
    chain = [(src, src + 1, 1) for src in range(num_nodes - 1)]
    graph = StructureModel()
    graph.add_weighted_edges_from(chain)

    data = generate_binary_data(graph, 100, seed=10)

    for row in data:
        assert len(row) == num_nodes
def test_add_edge_unknown(self):
    """An edge added with unknown origin should carry the unknown origin label"""
    structure = StructureModel()
    structure.add_edge(1, 2, "unknown")

    assert (1, 2) in structure.edges

    origins = structure.edges.data("origin")
    assert (1, 2, "unknown") in origins
def test_get_indices_empty_iterator(self, schema):
    """An empty iterator of predecessors should map to an empty list"""
    graph = StructureModel()
    # add node without parents:
    graph.add_node(10)

    mapper = VariableFeatureMapper(schema)
    no_parents = graph.predecessors(10)
    indices = mapper.get_indices(no_parents)

    assert len(indices) == 0
    assert isinstance(indices, list)
def test_add_edge_custom_attr(self):
    """An edge can be added together with a custom attribute"""
    structure = StructureModel()
    structure.add_edge(1, 2, x="Y")

    assert (1, 2) in structure.edges

    custom_values = structure.edges.data("x")
    assert (1, 2, "Y") in custom_values
def test_instance(self):
    """The target subgraph should itself be a StructureModel instance"""
    edges = [(0, 1), (1, 2), (1, 3), (4, 6)]
    structure = StructureModel()
    structure.add_edges_from(edges)

    target_subgraph = structure.get_target_subgraph(2)

    assert isinstance(target_subgraph, StructureModel)
def test_add_edge_expert(self):
    """An edge added with expert origin should carry the expert origin label"""
    structure = StructureModel()
    structure.add_edge(1, 2, "expert")

    assert (1, 2) in structure.edges

    origins = structure.edges.data("origin")
    assert (1, 2, "expert") in origins
def test_add_edge_learned(self):
    """An edge added with learned origin should carry the learned origin label"""
    structure = StructureModel()
    structure.add_edge(1, 2, "learned")

    assert (1, 2) in structure.edges

    origins = structure.edges.data("origin")
    assert (1, 2, "learned") in origins
def test_add_edge_default(self):
    """An edge added without an explicit origin should be labelled unknown"""
    structure = StructureModel()
    structure.add_edge(1, 2)

    assert (1, 2) in structure.edges

    origins = structure.edges.data("origin")
    assert (1, 2, "unknown") in origins
def test_add_weighted_edges_from_custom_attr(self):
    """Weighted edges can be added together with a custom attribute"""
    weighted_edges = [(1, 2, 0.5), (2, 3, 0.5)]
    structure = StructureModel()
    structure.add_weighted_edges_from(weighted_edges, x="Y")

    # Every edge keeps its weight ...
    for src, dst, weight in weighted_edges:
        assert (src, dst, weight) in structure.edges.data("weight")

    # ... and carries the custom attribute.
    for src, dst, _ in weighted_edges:
        assert (src, dst, "Y") in structure.edges.data("x")
def test_add_weighted_edges_from_expert(self):
    """Weighted edges added with expert origin should carry the expert label"""
    weighted_edges = [(1, 2, 0.5), (2, 3, 0.5)]
    structure = StructureModel()
    structure.add_weighted_edges_from(weighted_edges, origin="expert")

    # Every edge keeps its weight ...
    for src, dst, weight in weighted_edges:
        assert (src, dst, weight) in structure.edges.data("weight")

    # ... and is tagged with the requested origin.
    for src, dst, _ in weighted_edges:
        assert (src, dst, "expert") in structure.edges.data("origin")
def test_add_weighted_edges_from_default(self):
    """Weighted edges added without an explicit origin should be labelled unknown"""
    weighted_edges = [(1, 2, 0.5), (2, 3, 0.5)]
    structure = StructureModel()
    structure.add_weighted_edges_from(weighted_edges)

    # Every edge keeps its weight ...
    for src, dst, weight in weighted_edges:
        assert (src, dst, weight) in structure.edges.data("weight")

    # ... and falls back to the default origin.
    for src, dst, _ in weighted_edges:
        assert (src, dst, "unknown") in structure.edges.data("origin")
def test_add_edges_from_custom_attr(self):
    """Edges can be added in bulk together with a custom attribute"""
    pairs = [(1, 2), (2, 3)]
    structure = StructureModel()
    structure.add_edges_from(pairs, x="Y")

    for pair in pairs:
        assert pair in structure.edges

    for src, dst in pairs:
        assert (src, dst, "Y") in structure.edges.data("x")
def test_add_edges_from_expert(self):
    """Edges added in bulk with expert origin should carry the expert label"""
    pairs = [(1, 2), (2, 3)]
    structure = StructureModel()
    structure.add_edges_from(pairs, "expert")

    for pair in pairs:
        assert pair in structure.edges

    for src, dst in pairs:
        assert (src, dst, "expert") in structure.edges.data("origin")
def test_zero_lambda(self):
    """
    A wrong initialisation could lead to counts always being zero
    for nodes that don't have parents.
    """
    graph = StructureModel()
    graph.add_nodes_from(list(range(20)))  # 20 parentless nodes

    df = generate_count_dataframe(graph, 10000)

    zero_mean_columns = df.mean() == 0
    assert not np.any(zero_mean_columns)
def test_to_undirected(self):
    """Converting to undirected should produce an undirected Graph"""
    structure = StructureModel()
    structure.add_edges_from([(1, 2), (2, 1), (2, 3), (3, 4)])

    udg = structure.to_undirected()

    for edge in [(2, 3), (3, 4)]:
        assert edge in udg.edges

    # The two opposing directed edges between 1 and 2 collapse into one,
    # which may be reported in either orientation.
    assert any(edge in udg.edges for edge in [(1, 2), (2, 1)])
    assert len(udg.edges) == 3
def test_to_directed(self):
    """Converting to directed should produce a structure model"""
    pairs = [(1, 2), (2, 1), (2, 3), (3, 4)]
    structure = StructureModel()
    structure.add_edges_from(pairs)

    dag = structure.to_directed()

    assert isinstance(dag, StructureModel)
    for pair in pairs:
        assert pair in dag.edges
def test_number_of_columns(self, num_nodes, n_categories):
    """The dataframe should have num_nodes * n_categories columns"""
    # Chain 0 -> 1 -> ... -> num_nodes - 1, each edge with weight 1.
    chain = [(src, src + 1, 1) for src in range(num_nodes - 1)]
    graph = StructureModel()
    graph.add_weighted_edges_from(chain)

    data = generate_categorical_dataframe(
        graph, 100, seed=10, n_categories=n_categories
    )

    _, n_columns = data.shape
    assert n_columns == (num_nodes * n_categories)
def test_fit_with_missing_feature_in_data(self):
    """Fitting on data that lacks a network feature must raise a KeyError"""
    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "e", 1)])

    # The data has no column for node "e".
    incomplete = pd.DataFrame([[1, 1, 1, 1]], columns=["a", "b", "c", "d"])

    expected_message = (
        "The data does not cover all the features found in the Bayesian Network. "
        "Please check the following features: {'e'}"
    )
    with pytest.raises(KeyError, match=expected_message):
        BayesianNetwork(structure).fit_node_states(incomplete)