def test_do_accepts_all_state_probabilities(self, bn):
    """A full state->probability mapping given to Do is reflected as-is in the marginals"""

    engine = InferenceEngine(bn)
    engine.do_intervention("d", {False: 0.7, True: 0.3})

    # One fresh query per checked state
    for state, expected in ((False, 0.7), (True, 0.3)):
        assert math.isclose(engine.query()["d"][state], expected)
def test_do_reflected_in_query(self, bn):
    """An intervention on "d" must show up in the marginals of a query that observes another node"""

    engine = InferenceEngine(bn)

    # Before the intervention "d" is not certain to be True
    prob_before = engine.query({"a": "b"})["d"][True]
    assert prob_before != 1

    engine.do_intervention("d", True)

    # After the intervention the same query must report certainty
    prob_after = engine.query({"a": "b"})["d"][True]
    assert prob_after == 1
def test_do_reflected_in_query(self, train_model, train_data_idx):
    """An intervention on "d" must show up in the marginals of a query that observes another node"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    # Before the intervention state 1 of "d" is not certain
    prob_before = engine.query({"a": 1})["d"][1]
    assert prob_before != 1

    engine.do_intervention("d", 1)

    # After the intervention the same query must report certainty
    prob_after = engine.query({"a": 1})["d"][1]
    assert prob_after == 1
def test_observations_affect_marginals(self, bn):
    """Fixing the state of "d" by observation must shift the marginal of the dependent node "b\""""

    engine = InferenceEngine(bn)
    baseline = engine.query({})
    observed = engine.query({"d": True})

    # The observed node itself collapses onto the observed state
    assert observed["d"][False] == 0
    assert observed["d"][True] == 1

    # The dependent node must move by more than the tolerance
    assert not math.isclose(observed["b"]["x"], baseline["b"]["x"], abs_tol=0.05)
def test_observations_does_not_affect_marginals_of_independent_nodes(self, bn):
    """Fixing the state of "d" by observation must leave the marginal of the independent node "e" unchanged"""

    engine = InferenceEngine(bn)
    baseline = engine.query({})
    observed = engine.query({"d": True})

    # The observed node itself collapses onto the observed state
    assert observed["d"][False] == 0
    assert observed["d"][True] == 1

    # The independent node stays within the tolerance of its baseline
    assert math.isclose(observed["e"][True], baseline["e"][True], abs_tol=0.05)
def test_reset_do_sets_probabilities_back_to_initial_state(
    self, bn, train_data_discrete_marginals
):
    """Resetting Do operator should re-introduce the original conditional dependencies

    After an intervention on "d" is reset, the marginal of *every* state of "d"
    must match the marginals of the training data again.
    """

    ie = InferenceEngine(bn)
    ie.do_intervention("d", {False: 0.7, True: 0.3})
    ie.reset_do("d")

    # Check both states: the previous version asserted the False state twice
    # and never verified the True state.
    for state in (False, True):
        assert math.isclose(
            ie.query()["d"][state], train_data_discrete_marginals["d"][state]
        )
def test_observations_affect_marginals(self, train_model, train_data_idx):
    """Fixing the state of "d" by observation must shift the marginal of the dependent node "b\""""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    baseline = engine.query({})
    observed = engine.query({"d": 1})

    # The observed node itself collapses onto the observed state
    assert observed["d"][0] == 0
    assert observed["d"][1] == 1

    # The dependent node must move by more than the tolerance
    assert not math.isclose(observed["b"][1], baseline["b"][1], abs_tol=0.01)
def test_reset_do_sets_probabilities_back_to_initial_state(
    self, train_model, train_data_idx, train_data_idx_marginals
):
    """Undoing an intervention on "d" must bring its marginals back to those of the training data"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", {0: 0.7, 1: 0.3})
    engine.reset_do("d")

    # One fresh query per checked state
    for state in (0, 1):
        assert math.isclose(
            engine.query()["d"][state], train_data_idx_marginals["d"][state]
        )
def intervention(cls, input):
    """Apply do-interventions to one node and collect the resulting marginals of a target node.

    For every entry of ``input["states"]`` the do operator is applied to
    ``input["node"]``, the marginal of ``input["target_node"]`` is queried, and
    the intervention is reset before the next entry is processed.

    Args:
        input (dict): Description of the interventions, with the keys

            * ``"node"``: the node to intervene on.
            * ``"states"``: a list of dicts, each mapping a state to the
              value handed to the do operator for that state (presumably a
              probability; confirm against the caller). Keys are converted
              with ``int`` and values with ``float``.
            * ``"target_node"``: the node whose marginal is read after each
              intervention.

    Returns:
        list: ``ie.query()[target_node]`` after each intervention, in the
        order of ``input["states"]``.
    """
    from causalnex.inference import InferenceEngine

    bn = cls.get_model()
    ie = InferenceEngine(bn)

    i_node = input["node"]
    i_states = input["states"]
    i_target = input["target_node"]
    print(i_node, i_states, i_target)

    lst = []
    # i_states is a list of dict
    for raw_state in i_states:
        # Values are converted with float, not int: int() would truncate a
        # fractional value such as 0.7 down to 0 before it reaches the do operator.
        state = {int(k): float(v) for k, v in raw_state.items()}
        ie.do_intervention(i_node, state)
        intervention_result = ie.query()[i_target]
        lst.append(intervention_result)
        print("Updated marginal", intervention_result)
        # Undo the intervention so the next entry starts from the original CPDs
        ie.reset_do(i_node)
    return lst
def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
    """Intervening with a single state must give that state probability 1"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", 1)

    prob_of_forced_state = engine.query()["d"][1]
    assert math.isclose(prob_of_forced_state, 1)
def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
    """A full state->probability mapping given to Do is reflected as-is in the marginals"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", {0: 0.7, 1: 0.3})

    # One fresh query per checked state
    for state, expected in ((0, 0.7), (1, 0.3)):
        assert math.isclose(engine.query()["d"][state], expected)
def test_do_sets_other_state_probabilitys_to_zero(self, train_model, train_data_idx):
    """Intervening with a single state must give the remaining state probability 0"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", 1)

    prob_of_other_state = engine.query()["d"][0]
    assert prob_of_other_state == 0
def test_empty_query_returns_marginals(self, bn, train_data_discrete_marginals):
    """Querying without observations must reproduce the marginals of the training data"""

    engine = InferenceEngine(bn)
    queried = engine.query({})

    for node_name in queried:
        for state, probability in queried[node_name].items():
            expected = train_data_discrete_marginals[node_name][state]
            assert math.isclose(expected, probability, abs_tol=0.05)
def test_empty_query_returns_marginals(self, train_model, train_data_idx,
                                       train_data_idx_marginals):
    """Querying without observations must reproduce the marginals of the training data"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    queried = engine.query({})

    for node_name in queried:
        for state, probability in queried[node_name].items():
            expected = train_data_idx_marginals[node_name][state]
            assert math.isclose(expected, probability, abs_tol=0.05)
def test_multi_query(self, bn):
    """Batched queries (parallel and sequential) must agree with each other and with single queries"""

    def observation_batch():
        # Build fresh dicts on every call so no query shares objects with another
        return [
            {"a": "a", "b": "x"},
            {"a": "c", "e": False},
            {"b": "x"},
        ]

    engine = InferenceEngine(bn)

    results_parallel = engine.query(observation_batch(), parallel=True)
    results_loop = engine.query(observation_batch(), parallel=False)
    singles = [engine.query(observation) for observation in observation_batch()]

    assert len(results_parallel) == 3
    assert results_parallel == results_loop

    # Every observed node must be pinned to its observed state
    pinned = (("a", "a"), ("e", False), ("b", "x"))
    for position, (node_name, state) in enumerate(pinned):
        assert results_parallel[position][node_name][state] == 1

    # A single query must give the same answer as the matching batch entry
    for single, batched in zip(singles, results_parallel):
        assert single == batched
def marginal_probs(graph, query, observations, verbose=1):
    '''
    Return the marginal distribution of one node given some observations.

    [graph] : causalnex BayesianNetwork object
    [query] : str, name of the node whose marginals are returned
    [observations] : dict, passed unchanged to InferenceEngine.query
    [verbose] : when truthy, the result is also printed
    '''
    engine = InferenceEngine(graph)
    marg_probs = engine.query(observations)[query]

    if not verbose:
        return marg_probs

    message = 'Marginal probabilities of "{}" | {} = {}'.format(
        query, observations, marg_probs)
    print(message)
    return marg_probs
def predict_using_all_nodes(
    bn: BayesianNetwork,
    data: pd.DataFrame,
    target_var: str,
    markov_blanket: bool = False,
    lv_name: str = "LV",
) -> pd.DataFrame:
    """
    Query the marginals of the target variable, one row of observations at a time

    Args:
        bn: Bayesian network
        data: Input dataframe; each row becomes one set of observations
        target_var: Target variable name
        markov_blanket: If True, only nodes of the Markov blanket of the target
            and latent variable are used as observations, otherwise all nodes
        lv_name: Latent variable name

    Returns:
        Dataframe with one row of target marginals per input row
    """
    # Pick the candidate nodes to observe
    if markov_blanket:
        candidate_nodes = bn.structure.get_markov_blanket([target_var, lv_name]).nodes
    else:
        candidate_nodes = bn.nodes

    # The target and the latent variable are never used as observations
    excluded = {target_var, lv_name}
    observed_cols = [name for name in candidate_nodes if name not in excluded]

    # Run one query per row and keep only the target's marginals
    engine = InferenceEngine(bn)
    records = data[observed_cols].to_dict(orient="records")
    target_marginals = []
    for row_marginals in engine.query(records):
        target_marginals.append(row_marginals[target_var])

    return pd.DataFrame(target_marginals)
def test_invalid_observations(self, train_model, train_data_idx):
    """Observations that are neither dict, list nor None must be rejected with a TypeError"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_message = "Expecting observations to be a dict, list or None"

    # A string, a set and a tuple, in that order
    for bad_observations in ("123", {"123", "abc"}, ("123", "abc")):
        with pytest.raises(TypeError, match=expected_message):
            engine.query(bad_observations)
bayesNetFull: BayesianNetwork = bayesNetFull.fit_cpds( data=discrData, method="BayesianEstimator", bayes_prior="K2") # %% markdown [markdown] # Get warnings, showing we are replacing the previously existing CPDs # # **Second**: For inference, must create a new `InferenceEngine` from our `BayesianNetwork`, which lets us query the model. The query method will compute the marginal likelihood of all states for all nodes. Query lets us get the marginal distributions, marginalizing to get rid of the conditioning variable(s) for each node variable. # %% codecell from causalnex.inference import InferenceEngine eng = InferenceEngine(bn=bayesNetFull) eng # %% markdown [markdown] # Query the baseline marginal distributions, which means querying marginals **as learned from data**: # %% codecell marginalDistLearned: Dict[str, Dict[str, float]] = eng.query() marginalDistLearned # %% codecell marginalDistLearned['address'] # %% codecell marginalDistLearned['G1'] # %% markdown [markdown] # Output tells us that `P(G1=Fail) ~ 0.25` and `P(G1 = Pass) ~ 0.75`. As a quick sanity check can compute what proportion of our data are `Fail` and `Pass`, should give nearly the same result: # %% codecell import numpy as np labels, counts = np.unique(discrData['G1'], return_counts=True) print(list(zip(labels, counts))) print('\nProportion failures = {}'.format(counts[0] / sum(counts)))
def test_em_algorithm(self):  # pylint: disable=too-many-locals
    """
    Test if `BayesianNetwork` works with EM algorithm.
    We use a naive bayes + parents + an extra node not related to the latent variable.

    The latent variable `z` is dropped from the training data, a network is
    fitted without it, `z` is then added back as a latent node and its CPDs
    are fitted with `fit_latent_cpds`. The result is compared against a
    baseline network fitted on the complete data (including the true `z`).
    """
    # p0   p1  p2
    #   \  |  /
    #      z
    #   /  |  \
    # c0  c1  c2
    # |
    # cc0

    # Fix the seed: both the data generation below and the `cc_0` column use numpy's global RNG
    np.random.seed(22)

    data, sm, _, true_lv_values = naive_bayes_plus_parents(
        percentage_not_missing=0.1,
        samples=1000,
        p_z=0.7,
        p_c=0.7,
    )
    # cc_0 copies c_0 where the random draw is below 0.5, otherwise it takes (c_0 + 1) % 3
    data["cc_0"] = np.where(
        np.random.random(len(data)) < 0.5, data["c_0"], (data["c_0"] + 1) % 3)
    # Training data must not contain the latent variable
    data.drop(columns=["z"], inplace=True)

    # The complete data is the training data plus the true latent values
    complete_data = data.copy(deep=True)
    complete_data["z"] = true_lv_values

    # Baseline model: the structure of the figure trained with complete data. We try to reproduce it
    complete_bn = BayesianNetwork(
        StructureModel(list(sm.edges) + [("c_0", "cc_0")]))
    complete_bn.fit_node_states_and_cpds(complete_data)

    # BN without latent variable: All `p`s are connected to all `c`s + `c0` ->`cc0`
    sm_no_lv = StructureModel([(f"p_{p}", f"c_{c}") for p in range(3)
                               for c in range(3)] + [("c_0", "cc_0")])
    bn = BayesianNetwork(sm_no_lv)
    bn.fit_node_states(data)
    bn.fit_cpds(data)

    # TEST 1: cc_0 does not depend on the latent variable so:
    assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

    # BN with latent variable
    # When we add the latent variable, we add the edges in the image above
    # and remove the connection among `p`s and `c`s
    edges_to_add = list(sm.edges)
    edges_to_remove = [(f"p_{p}", f"c_{c}") for p in range(3) for c in range(3)]
    bn.add_node("z", edges_to_add, edges_to_remove)
    bn.fit_latent_cpds("z", [0, 1, 2], data, stopping_delta=0.001)

    # TEST 2: cc_0 CPD should remain untouched by the EM algorithm
    assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

    # TEST 3: We should recover the correct CPDs quite accurately
    assert bn.cpds.keys() == complete_bn.cpds.keys()
    assert self.mean_absolute_error(bn.cpds, complete_bn.cpds) < 0.01

    # TEST 4: Inference over recovered CPDs should be also accurate
    eng = InferenceEngine(bn)
    query = eng.query()
    n_rows = complete_data.shape[0]

    # Compare queried marginals with empirical frequencies in the complete data.
    # NOTE(review): only states 0 and 1 of each node are checked here.
    for node in query:
        assert (np.abs(query[node][0] - sum(complete_data[node] == 0) / n_rows) < 1e-2)
        assert (np.abs(query[node][1] - sum(complete_data[node] == 1) / n_rows) < 1e-2)

    # TEST 5: Inference using predict and predict_probability functions
    report = classification_report(bn, complete_data, "z")
    _, auc = roc_auc(bn, complete_data, "z")
    complete_report = classification_report(complete_bn, complete_data, "z")
    _, complete_auc = roc_auc(complete_bn, complete_data, "z")

    # Report entries are either a dict of metrics or a single number; both must match the baseline
    for category, metrics in report.items():
        if isinstance(metrics, dict):
            for key, val in metrics.items():
                assert np.abs(val - complete_report[category][key]) < 1e-2
        else:
            assert np.abs(metrics - complete_report[category]) < 1e-2

    assert np.abs(auc - complete_auc) < 1e-2
def test_do_sets_other_state_probabilitys_to_zero(self, bn):
    """Intervening with a single state must give the remaining state probability 0"""

    engine = InferenceEngine(bn)
    engine.do_intervention("d", True)

    prob_of_other_state = engine.query()["d"][False]
    assert prob_of_other_state == 0
def test_do_sets_state_probability_to_one(self, bn):
    """Intervening with a single state must give that state probability 1"""

    engine = InferenceEngine(bn)
    engine.do_intervention("d", True)

    prob_of_forced_state = engine.query()["d"][True]
    assert math.isclose(prob_of_forced_state, 1)
bayesNetFull: BayesianNetwork = bayesNetFull.fit_cpds( data=data, method="BayesianEstimator", bayes_prior="K2") # %% markdown [markdown] # Get warnings, showing we are replacing the previously existing CPDs # # **Second**: For inference, must create a new `InferenceEngine` from our `BayesianNetwork`, which lets us query the model. The query method will compute the marginal likelihood of all states for all nodes. Query lets us get the marginal distributions, marginalizing to get rid of the conditioning variable(s) for each node variable. # %% codecell from causalnex.inference import InferenceEngine eng = InferenceEngine(bn=bayesNetFull) eng # %% markdown [markdown] # Query the baseline marginal distributions, which means querying marginals **as learned from data**: # %% codecell marginalDistLearned: Dict[str, Dict[str, float]] = eng.query() marginalDistLearned # %% codecell marginalDistLearned['injury_type'] # %% codecell marginalDistLearned['absenteeism_level'] # %% markdown [markdown] # As a quick sanity check can compute the corresponding proportion of our data , which should give nearly the same result: # %% codecell import numpy as np labels, counts = np.unique(data['absenteeism_level'], return_counts=True) print(list(zip(labels, counts)))
print('The prediction is \'{prediction}\''.format(prediction=predictions.loc[18, 'G1_prediction']))
print('The ground truth is \'{truth}\''.format(truth=discretised_data.loc[18, 'G1']))

# Evaluation
classification_report(bn, test, "G1")
roc, auc = roc_auc(bn, test, "G1")
print(auc)

# Marginal probability baseline (same as above)
bn = bn.fit_cpds(discretised_data, method="BayesianEstimator", bayes_prior="K2")

# Compute the marginal likelihood for all states and nodes
ie = InferenceEngine(bn)
marginals = ie.query()
print('Marginal Likelihood of Target: ', marginals["G1"])

# Count the actual label distribution to check that it is close to the computed likelihood
labels, counts = np.unique(discretised_data["G1"], return_counts=True)
list(zip(labels, counts))

# Compute the marginal probabilities for each case (label) of the study-time variable
marginals_short = ie.query({"studytime": "short-studytime"})
marginals_long = ie.query({"studytime": "long-studytime"})
print("Marginal G1 | Short Studtyime", marginals_short["G1"])
print("Marginal G1 | Long Studytime", marginals_long["G1"])

""" Marginal G1 | Short Studtyime {'Fail': 0.2776556433482524, 'Pass': 0.7223443566517477}
def test_query_after_do_intervention_has_split_graph(self, chain_network):
    """
    chain network: a → b → c → d → e

    test 1.
    - do intervention on node c generates 2 graphs (a → b) and (c → d → e)
    - assert the query can be run (it used to hang before)
    - assert reset_do works

    test 2.
    - the same checks, intervening on node b so that there is one single
      isolate (a)
    """
    ie = InferenceEngine(chain_network)
    original_margs = ie.query()

    def assert_marginals_restored():
        """Query again and check every marginal equals its pre-intervention value."""
        reset_margs = ie.query()
        for node, dict_left in original_margs.items():
            dict_right = reset_margs[node]
            # Compare state by state: pairing the two dicts by position would
            # silently depend on key order and ignore missing states.
            assert dict_left.keys() == dict_right.keys()
            for state, prob in dict_left.items():
                assert math.isclose(prob, dict_right[state])

    var = "c"
    state_dict = {0: 1.0, 1: 0.0}
    ie.do_intervention(var, state_dict)

    # assert the intervention node has indeed the right state
    assert ie.query()[var][0] == state_dict[0]
    assert ie.query()[var][1] == state_dict[1]

    # assert the upstream nodes have the default marginals (no info
    # propagates in the upstream graph)
    assert ie.query()["a"][0] == original_margs["a"][0]
    assert ie.query()["a"][1] == original_margs["a"][1]
    assert ie.query()["b"][0] == original_margs["b"][0]
    assert ie.query()["b"][1] == original_margs["b"][1]

    # assert the _cpds of the upstream nodes are stored correctly
    orig_cpds = ie._cpds_original  # pylint: disable=protected-access
    upstream_cpds = ie._detached_cpds  # pylint: disable=protected-access
    assert orig_cpds["a"] == upstream_cpds["a"]
    assert orig_cpds["b"] == upstream_cpds["b"]

    ie.reset_do(var)
    assert_marginals_restored()

    # repeating above tests intervening on b, so that there is one single
    # isolate
    var_b = "b"
    state_dict_b = {0: 1.0, 1: 0.0}
    ie.do_intervention(var_b, state_dict_b)

    # assert the intervention node has indeed the right state
    # (checked against state_dict_b, the distribution actually applied to b)
    assert ie.query()[var_b][0] == state_dict_b[0]
    assert ie.query()[var_b][1] == state_dict_b[1]

    # assert the upstream nodes have the default marginals (no info
    # propagates in the upstream graph)
    assert ie.query()["a"][0] == original_margs["a"][0]
    assert ie.query()["a"][1] == original_margs["a"][1]

    # assert the _cpds of the upstream nodes are stored correctly
    orig_cpds = ie._cpds_original  # pylint: disable=protected-access
    upstream_cpds = ie._detached_cpds  # pylint: disable=protected-access
    assert orig_cpds["a"] == upstream_cpds["a"]

    ie.reset_do(var_b)
    assert_marginals_restored()
# * $\color{red}{\text{TODO}}:$ why is it true that there are equally likely probabilities everywhere else? # %% codecell bayesNet.cpds[AbsenteeismLevel.var] # %% markdown [markdown] # ## Step 4: Inference (querying marginals) # %% codecell from causalnex.inference import InferenceEngine eng = InferenceEngine(bn = bayesNet) # querying the baseline marginals as learned from the data marginalDist: Dict[Name, Dict[State, Probability]] = eng.query() marginalDist # %% markdown [markdown] # Checking marginal distribution of **work-capacity**: # %% codecell eng.query()[WorkCapacity.var] # %% markdown [markdown] # Biasing so that lower work capacity probability gets higher: # %% codecell # NOTE: in the data, in TIME + 30, when exertion, training, experience are all HIGH, the work-capacity = LOW eng.query({Time.var : 30, ExertionLevel.var : 'High', TrainingLevel.var : 'High', ExperienceLevel.var : 'High'})[WorkCapacity.var] # %% codecell # Different than data: at time = 30, in data all these exertion, experience, training are High, so testing what happens to workcapacity when they are set to Medium: eng.query({Time.var : 30, ExertionLevel.var : 'Medium', TrainingLevel.var : 'Medium', ExperienceLevel.var : 'Medium'})[WorkCapacity.var] # %% codecell