def test_auc_node_with_no_parents(self):
    """AUC must be computable for a node that has no parents (expected ~0.5 here)"""

    low, high = range(2), range(2, 4)
    # c=1 dominates the low quadrant, c=0 dominates the high quadrant
    rows = (
        [[a, b, 0] for a in low for b in low]
        + [[a, b, 1] for a in low for b in low for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 0] for a in high for b in high for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 1] for a in high for b in high]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    # "a" only has outgoing edges in this structure
    _, auc = roc_auc(bn, train, "a")
    assert math.isclose(auc, 0.5, abs_tol=0.01)
def test_auc_for_nonnumeric_features(self):
    """AUC of accurate predictions should stay at 1 when the target states are strings"""

    low, high = range(2), range(2, 4)
    # roughly equal weighting of the two classes of c so that a high ROC is expected
    rows = (
        [[a, b, 0] for a in low for b in low]
        + [[a, b, 1] for a in low for b in low for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 0] for a in high for b in high for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 1] for a in high for b in high]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # replace the numeric states of c with string labels
    string_labels = {0: "f", 1: "g"}
    train["c"] = train["c"].map(string_labels)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_auc_of_accurate_predictions(self):
    """When the model predicts c accurately the AUC should be 1"""

    low, high = range(2), range(2, 4)
    # roughly equal weighting of the two classes of c so that a high ROC is expected
    rows = (
        [[a, b, 0] for a in low for b in low]
        + [[a, b, 1] for a in low for b in low for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 0] for a in high for b in high for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 1] for a in high for b in high]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_auc_with_missing_state_in_test(self):
    """AUC should still be computed correctly when the test set lacks one of the target states"""

    low, high = range(2), range(2, 4)
    # roughly equal weighting of the two classes of c so that a high ROC is expected
    rows = (
        [[a, b, 0] for a in low for b in low]
        + [[a, b, 1] for a in low for b in low for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 0] for a in high for b in high for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 1] for a in high for b in high]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # the test set only contains rows where c == 1
    only_ones = train["c"] == 1
    test = train[only_ones]
    assert test["c"].nunique() == 1

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, test, "c")
    assert math.isclose(auc, 1, abs_tol=0.01)
def test_roc_of_random_has_unit_gradient(self):
    """For random labels the ROC curve should follow the diagonal from (0,0) to (1,1)"""

    values = range(3)
    # whatever a and b are, c=1 is always the likelier state, but by a varying margin
    # (this produces several threshold points on the roc curve)
    rows = (
        [[a, b, 0] for a in values for b in values]
        + [[a, b, 1] for a in values for b in values for _ in range(1000 * (a + b + 1))]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    prob_c_is_one = bn.cpds["c"].loc[1].values
    assert np.allclose(prob_c_is_one, 1, atol=0.02)

    # NOTE(review): the labels below use the `random` module and no seed is set in this test
    random_rows = [
        [a, b, random.randint(0, 1)]
        for a in values
        for b in values
        for _ in range(1000)
    ]
    test = pd.DataFrame(random_rows, columns=["a", "b", "c"])

    roc, _ = roc_auc(bn, test, "c")
    assert len(roc) > 3
    assert all(math.isclose(x, y, abs_tol=0.03) for x, y in roc)
def test_roc_of_accurate_predictions(self):
    """With accurate predictions TPR should exceed FPR at every interior ROC point"""

    low, high = range(2), range(2, 4)
    # roughly equal weighting of the two classes of c so that a high ROC is expected
    rows = (
        [[a, b, 0] for a in low for b in low for _ in range(10)]
        + [[a, b, 1] for a in low for b in low for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 0] for a in high for b in high for _ in range(1000 + 10 * (a + b))]
        + [[a, b, 1] for a in high for b in high for _ in range(10)]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    roc, _ = roc_auc(bn, train, "c")
    # points whose TPR is exactly 0 or 1 are excluded from the comparison
    assert all(tpr > fpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
def test_auc_of_incorrect_close_to_zero(self):
    """Predictions that are consistently wrong should give an AUC close to zero"""

    values = range(3)
    # whatever a and b are, c=1 is always the likelier state, but by a varying margin
    # (this produces several threshold points on the roc curve)
    train_rows = (
        [[a, b, 0] for a in values for b in values]
        + [[a, b, 1] for a in values for b in values for _ in range(1000 * (a + b + 1))]
    )
    train = pd.DataFrame(train_rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    prob_c_is_one = bn.cpds["c"].loc[1].values
    assert np.allclose(prob_c_is_one, 1, atol=0.02)

    # the test set is the opposite of train: c=0 is always the likelier state
    test_rows = (
        [[a, b, 0] for a in values for b in values for _ in range(1000)]
        + [[a, b, 1] for a in values for b in values]
    )
    test = pd.DataFrame(test_rows, columns=["a", "b", "c"])

    _, auc = roc_auc(bn, test, "c")
    assert math.isclose(auc, 0, abs_tol=0.001)
def compare_result_with_ideal(
    em_cpds: Dict[str, pd.DataFrame],
    sm: StructureModel,
    data: pd.DataFrame,
    true_values_lv: np.ndarray,
    node_states: Dict[AnyStr, Union[List, Set]],
) -> Tuple[float, float]:
    """
    Compare learned CPDs with ideal CPDs

    The ideal CPDs are obtained by fitting a Bayesian network on `data` with the
    true latent variable values stored in a column named "z".

    Args:
        em_cpds: Learned CPDs for different nodes
        sm: Structure model
        data: Input dataset (left unmodified; a copy receives the "z" column)
        true_values_lv: Ideal values of the latent variable
        node_states: Possible states of different nodes

    Returns:
        Maximum absolute difference and root mean square of differences
    """
    # Work on a copy so that the caller's dataframe does not silently gain a "z" column
    data = data.copy()
    data["z"] = true_values_lv.reshape(-1)

    bn = BayesianNetwork(sm)
    bn.fit_node_states(states_to_df(node_states))
    bn.fit_cpds(data)

    max_delta = -1
    avg_delta = 0

    for node in em_cpds:
        deltas = (em_cpds[node] - bn.cpds[node]).abs().values
        max_delta = max(max_delta, deltas.max())
        # accumulate the mean squared difference of each node
        avg_delta += np.mean(deltas ** 2)

    # root of the per-node mean squared differences averaged over the nodes
    avg_delta = np.sqrt(avg_delta / len(em_cpds))
    return max_delta, avg_delta
def test_roc_of_incorrect_has_fpr_lt_tpr(self):
    """The ROC of incorrect predictions should have FPR > TPR at every interior point"""

    # NOTE: the test name says "fpr_lt_tpr", but the assertion below checks fpr > tpr;
    # the name is left unchanged so that existing test selections keep working.

    # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
    # points in roc curve)
    train = pd.DataFrame(
        [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1)]
        + [
            [a, b, 1]
            for a in range(3)
            for b in range(3)
            for _ in range(a * 1000 + b * 1000 + 1000)
        ],
        columns=["a", "b", "c"],
    )

    cg = StructureModel()
    cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    bn = BayesianNetwork(cg)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # in test, c=0 is always more likely (opposite of train)
    test = pd.DataFrame(
        [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1000)]
        + [[a, b, 1] for a in range(3) for b in range(3) for _ in range(1)],
        columns=["a", "b", "c"],
    )

    roc, _ = roc_auc(bn, test, "c")

    assert len(roc) > 3
    # points whose TPR is exactly 0 or 1 are excluded from the comparison
    assert all(fpr > tpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
def test_report_ignores_unrequired_columns_in_data(
    self, train_data_idx, train_data_discrete, test_data_c_discrete
):
    """Classification report should ignore any columns that are not needed by predict"""

    structure = from_pandas(train_data_idx, w_threshold=0.3)
    bn = BayesianNetwork(structure).fit_node_states(train_data_discrete)

    # add a column that is not part of the learned structure before fitting the CPDs
    n_rows = len(train_data_discrete)
    train_data_discrete["NEW_COL"] = [1] * n_rows
    bn.fit_cpds(train_data_discrete)

    # no assertion: the call is only expected to complete
    classification_report(bn, test_data_c_discrete, "c")
def test_create_inference_with_bad_variable_names_fails(
    self, train_model, train_data_idx
):
    """Creating an InferenceEngine should raise ValueError when a node name contains `$`"""

    def with_dollar(name):
        # prefix every "a" in the node name with "$"
        return str(name).replace("a", "$a")

    renamed_edges = [(with_dollar(u), with_dollar(v)) for u, v in train_model.edges]
    model = StructureModel()
    model.add_edges_from(renamed_edges)

    # rename the matching data column in place
    train_data_idx.rename(columns={"a": "$a"}, inplace=True)

    bn = BayesianNetwork(model).fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)

    with pytest.raises(ValueError, match="Variable names must match.*"):
        InferenceEngine(bn)
def chain_network() -> BayesianNetwork:
    """
    Build a fitted Bayesian network with a chain structure, used to test
    do-interventions that split the graph into subgraphs.

    a → b → c → d → e

    Returns:
        The chain network fitted on 50 rows of random binary data
    """
    n = 50
    nodes_names = ["a", "b", "c", "d", "e"]

    # a random draw in [0, 10) above 6 maps to 1, anything else to 0
    draws = np.random.randint(10, size=(n, len(nodes_names)))
    df = pd.DataFrame(data=(draws > 6).astype(int), columns=nodes_names)

    # link every node to its successor in the chain
    chain_edges = list(zip(nodes_names[:-1], nodes_names[1:]))
    model = StructureModel()
    model.add_edges_from(chain_edges)

    chain_bn = BayesianNetwork(model)
    chain_bn = chain_bn.fit_node_states(df)
    chain_bn = chain_bn.fit_cpds(df, method="BayesianEstimator", bayes_prior="K2")
    return chain_bn
def train_bn(data, graph):
    """
    Build a Bayesian network on `graph` and fit it on `data`

    Args:
        data: dataset used to learn both the node states and the CPDs
        graph: structure passed to the BayesianNetwork constructor

    Returns:
        The network fitted with the Bayesian estimator and a K2 prior
    """
    network = BayesianNetwork(graph).fit_node_states(data)
    return network.fit_cpds(data, method='BayesianEstimator', bayes_prior='K2')
def test_fit_missing_states(self):
    """test issues/15: fitting CPDs must work when a known state is absent from the training data"""

    columns = ["a", "b", "c"]
    train = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 1]], columns=columns)
    # c == 2 only appears in the test rows
    test = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 2]], columns=columns)

    bn = BayesianNetwork(StructureModel([("a", "b"), ("c", "b")]))

    # node states are learned from train + test, CPDs from train only
    bn.fit_node_states(pd.concat([train, test]))
    bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

    cpd_c = bn.cpds["c"]
    assert cpd_c.loc[1][0] == 0.8
    assert cpd_c.loc[2][0] == 0.2
def get_avg_auc_all_info(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
    n_cpus: int = multiprocessing.cpu_count() - 1,
) -> float:
    """
    Utility function to compute AUC using all nodes beyond the parent nodes

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number
        n_cpus: Number of CPU cores to use. Values below 1 (e.g. the default on a
            single-core machine) are raised to 1

    Returns:
        Average AUC
    """
    # multiprocessing.Pool rejects a process count below 1
    n_cpus = max(1, n_cpus)

    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional indices, so select rows by position and not by label
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        # one task per node: every node is used as the prediction target in turn
        chunks = [[bn, test_df, target] for target in bn.nodes]

        with multiprocessing.Pool(n_cpus) as p:
            result = p.starmap(_compute_auc_stub, chunks)

        total_auc += sum(result) / len(bn.nodes)
        print(
            f"Processing fold {fold} using {n_cpus} cores takes {time() - t0} seconds"
        )

    return total_auc / n_splits
def get_auc_data(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> pd.Series:
    """
    Utility function to compute AUC based only on data observations

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number

    Returns:
        AUC of every node averaged over the folds, sorted in ascending order
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    nodes_auc = defaultdict(list)

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional indices, so select rows by position and not by label
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            nodes_auc[var].append(auc)

        print(f"Processing fold {fold} takes {time() - t0} seconds")

    # one column per node, one row per fold; average over the folds once
    mean_auc = pd.DataFrame(nodes_auc).mean(axis=0)
    col = mean_auc.idxmin()
    val = mean_auc.min()
    print(f"Variable with lowest AUC is {col} with the value of {val}")
    return mean_auc.sort_values()
def get_avg_auc(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> float:
    """
    Estimate the average auc of all nodes in a Bayesian Network given a structure and a dataset using
    k-fold cross-validation. This function uses the bn.predict method in causalnex and cannot be used with
    latent variable models

    Args:
        df: a dataset in the pandas format
        bn: a bayesian network EM object
        n_splits: Number of folds in k-fold cv
        seed: random seed used in k-fold cv

    Returns:
        Average AUC
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        cur_auc = 0
        # KFold yields positional indices, so select rows by position and not by label
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            cur_auc += auc

        print(f"Processing fold {fold} takes {time() - t0} seconds")
        # average over the nodes for this fold
        total_auc += cur_auc / len(bn.nodes)

    return total_auc / n_splits
def get_correct_cpds(
    df: pd.DataFrame,
    sm: StructureModel,
    node_states: Dict,
    true_lv_values: np.ndarray,
) -> Dict[str, pd.DataFrame]:
    """
    Get the cpds obtained if complete data was provided (no latent variable)

    Args:
        df: Input dataset (left unmodified; a copy receives the "z" column)
        sm: Structure model
        node_states: Dictionary of node states
        true_lv_values: True values of latent variable

    Returns:
        Ground-truth CPDs, as exposed by the `cpds` attribute of the fitted network
    """
    # copy first so that the caller's dataframe is not altered
    data = df.copy()
    data["z"] = true_lv_values

    bn = BayesianNetwork(sm)
    bn.fit_node_states(states_to_df(node_states))
    bn.fit_cpds(data)
    return bn.cpds
class BayesianNetworkClassifier(BaseEstimator, ClassifierMixin):
    """
    A class that supports discretising features and probability fitting with scikit-learn syntax

    Example:
    ::
        # Dataset is from https://archive.ics.uci.edu/ml/datasets/student+performance
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder
        >>> from causalnex.discretiser import Discretiser
        >>> from causalnex.network.sklearn import BayesianNetworkClassifier
        >>> from sklearn.model_selection import train_test_split
        >>> data = pd.read_csv('student-por.csv', delimiter=';')
        >>> drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
        >>> data = data.drop(columns=drop_col)
        >>> non_numeric_columns = list(data.select_dtypes(exclude=[np.number]).columns)
        >>> le = LabelEncoder()
        >>> for col in non_numeric_columns:
        ...     data[col] = le.fit_transform(data[col])
        >>> data["G3"] = Discretiser(method="fixed",
        ...     numeric_split_points=[10]).transform(data["G3"].values)
        >>> label = data["G3"]
        >>> data.drop(['G3'], axis=1, inplace=True)
        >>> X_train, X_test, y_train, y_test = train_test_split(
        ...     data, label, test_size=0.1, random_state=7)
        >>> edge_list = [('address', 'absences'),
        ...              ('Pstatus', 'famrel'),
        ...              ('Pstatus', 'absences'),
        ...              ('studytime', 'G1'),
        ...              ('G1', 'G2'),
        ...              ('failures', 'absences'),
        ...              ('failures', 'G1'),
        ...              ('schoolsup', 'G1'),
        ...              ('paid', 'absences'),
        ...              ('higher', 'famrel'),
        ...              ('higher', 'G1'),
        ...              ('internet', 'absences'),
        ...              ('G2', 'G3')]
        >>> discretiser_param = {
        ...     'absences': {'method':"fixed",
        ...                  'numeric_split_points':[1, 10]
        ...                 },
        ...     'G1': {'method':"fixed",
        ...            'numeric_split_points':[10]
        ...           },
        ...     'G2': {'method':"fixed",
        ...            'numeric_split_points':[10]
        ...           }
        ...     }
        >>> discretiser_alg = {'absences': 'unsupervised',
        ...                    'G1': 'unsupervised',
        ...                    'G2': 'unsupervised'
        ...                   }
        >>> bayesian_param = {'method':"BayesianEstimator", 'bayes_prior':"K2"}
        >>> clf = BayesianNetworkClassifier(edge_list, discretiser_alg,
        ...                                 discretiser_param, bayesian_param)
        >>> clf.fit(X_train, y_train)
        >>> clf.predict(X_test)
        array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
               1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
               1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0])
    """

    def __init__(
        self,
        list_of_edges: List[Tuple[str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify a supervised algorithm to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability model.
            Defaults to the Bayesian estimator with a K2 prior when not given.
            return_prob (bool): choose to return predictions or probability
            - if True: `predict` returns the output of `predict_probability`
            - if False: `predict` returns a 1-D prediction array

        Raises:
            KeyError: If an incorrect argument is passed
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        # both are set by `fit` and reused when discretising data in `predict`
        self._target_name = None
        self._discretise_data = None

    @staticmethod
    def _validate_discretiser(discretiser_alg, discretiser_kwargs):
        """
        Check that every requested algorithm is supported and that both
        dictionaries describe the same set of columns.

        Raises:
            KeyError: if an algorithm is not one of 'unsupervised', 'tree', 'mdlp'
            ValueError: if the two dictionaries do not have the same keys
        """
        unavailable_discretiser_algs = {
            k: v not in ["unsupervised", "tree", "mdlp"]
            for k, v in discretiser_alg.items()
        }

        if any(unavailable_discretiser_algs.values()):
            algs = {
                k: discretiser_alg[k]
                for k, v in unavailable_discretiser_algs.items()
                if v
            }
            raise KeyError(
                f"Some discretiser algorithms are not supported: `{algs}`. "
                "Please choose in ['unsupervised', 'tree', 'mdlp']")

        if set(discretiser_kwargs) != set(discretiser_alg):
            raise ValueError(
                "discretiser_alg and discretiser_kwargs should have the same keys"
            )

    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.
        The splitting thresholds are extracted from the training data

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised version of the input dataframe

        Raises:
            KeyError: if a column is mapped to an unsupported algorithm
        """
        X = X.copy()

        for col, alg in self.discretiser_alg.items():
            col_kwargs = self.discretiser_kwargs[col]

            if alg == "unsupervised":
                discretiser = Discretiser(**col_kwargs)
                # the "fixed" method is applied without fitting; the other
                # methods are fitted on the stored training column first
                if col_kwargs["method"] != "fixed":
                    discretiser = discretiser.fit(
                        self._discretise_data[col].values)
                X[col] = discretiser.transform(X[col].values)
                continue

            if alg == "tree":
                discretiser = DecisionTreeSupervisedDiscretiserMethod(
                    mode="single", tree_params=col_kwargs)
            elif alg == "mdlp":
                discretiser = MDLPSupervisedDiscretiserMethod(col_kwargs)
            else:
                # reachable if `discretiser_alg` was changed after __init__ validation
                raise KeyError(
                    f"Discretiser algorithm `{alg}` for column `{col}` is not supported. "
                    "Please choose in ['unsupervised', 'tree', 'mdlp']")

            # supervised discretisers are fitted against the stored training target
            discretiser.fit(
                dataframe=self._discretise_data,
                feat_names=[col],
                target=self._target_name,
                target_continuous=False,
            )
            X[col] = discretiser.transform(X[[col]])

        return X

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BayesianNetworkClassifier":
        """
        Build a Bayesian Network classifier from a set of training data.
        The method first discretises the feature using parameters in
        `discretiser_kwargs` and `discretiser_alg`. Next, it learns all the
        possible nodes that each feature can have. Finally, it learns the CPDs
        of the Bayesian Network.

        Args:
            X (pd.DataFrame): input training data
            y (pd.Series): categorical label for each row of X; its `name` is
            used as the target column name

        Returns:
            self
        """
        # keep the raw training data (with target) for fitting the discretisers later
        self._discretise_data = X.copy()
        self._discretise_data[y.name] = y
        self._target_name = y.name

        X = self._discretise_features(X)
        X[y.name] = y

        self.bn = self.bn.fit_node_states(X)
        self.bn = self.bn.fit_cpds(X, **self.probability_kwargs)
        return self

    def predict(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]:
        """
        Return predictions for the input data

        Args:
            X (pd.DataFrame): A dataframe of shape (num_row, num_features) for
            model to predict

        Returns:
            Model's prediction:
            - if `return_prob` is True: the output of `predict_probability`
            for the target
            - otherwise: a numpy array of shape (num_row,)

        Raises:
            ValueError: if CPDs are empty
        """
        if self.bn.cpds == {}:
            raise ValueError("No CPDs found. The model has not been fitted")

        X = self._discretise_features(X)

        if self.return_prob:
            pred = self.bn.predict_probability(X, self._target_name)
        else:
            pred = self.bn.predict(X, self._target_name).to_numpy().reshape(-1)

        return pred
def test_create_inference_from_bn(self, train_model, train_data_idx):
    """An InferenceEngine should be constructible from a fitted BayesianNetwork"""

    network = BayesianNetwork(train_model)
    bn = network.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)

    # no assertion: construction is only expected not to raise
    InferenceEngine(bn)
def test_em_algorithm(self):  # pylint: disable=too-many-locals
    """
    Check that `BayesianNetwork` works together with the EM algorithm.
    The structure is a naive bayes + parents + one extra node that is not
    connected to the latent variable.
    """

    # p0  p1  p2
    #   \  |  /
    #      z
    #   /  |  \
    # c0  c1  c2
    # |
    # cc0
    np.random.seed(22)

    data, sm, _, true_lv_values = naive_bayes_plus_parents(
        percentage_not_missing=0.1,
        samples=1000,
        p_z=0.7,
        p_c=0.7,
    )

    # cc_0 copies c_0 about half of the time and otherwise takes the next state (mod 3)
    keep_c0 = np.random.random(len(data)) < 0.5
    data["cc_0"] = np.where(keep_c0, data["c_0"], (data["c_0"] + 1) % 3)
    data.drop(columns=["z"], inplace=True)

    complete_data = data.copy(deep=True)
    complete_data["z"] = true_lv_values

    extra_edge = [("c_0", "cc_0")]

    # Baseline model: the structure of the figure trained with complete data.
    # This is what the EM fit is expected to reproduce
    complete_bn = BayesianNetwork(StructureModel(list(sm.edges) + extra_edge))
    complete_bn.fit_node_states_and_cpds(complete_data)

    # BN without latent variable: all `p`s are connected to all `c`s + `c0` -> `cc0`
    direct_edges = [(f"p_{p}", f"c_{c}") for p in range(3) for c in range(3)]
    sm_no_lv = StructureModel(direct_edges + extra_edge)
    bn = BayesianNetwork(sm_no_lv)
    bn.fit_node_states(data)
    bn.fit_cpds(data)

    # TEST 1: cc_0 does not depend on the latent variable so:
    assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

    # BN with latent variable
    # Adding the latent variable brings in the edges of the picture above
    # and removes the direct connections between `p`s and `c`s
    edges_to_add = list(sm.edges)
    edges_to_remove = [(f"p_{p}", f"c_{c}") for p in range(3) for c in range(3)]
    bn.add_node("z", edges_to_add, edges_to_remove)
    bn.fit_latent_cpds("z", [0, 1, 2], data, stopping_delta=0.001)

    # TEST 2: cc_0 CPD should remain untouched by the EM algorithm
    assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

    # TEST 3: We should recover the correct CPDs quite accurately
    assert bn.cpds.keys() == complete_bn.cpds.keys()
    assert self.mean_absolute_error(bn.cpds, complete_bn.cpds) < 0.01

    # TEST 4: Inference over recovered CPDs should be also accurate
    query = InferenceEngine(bn).query()
    n_rows = complete_data.shape[0]

    for node in query:
        # only states 0 and 1 are compared against their observed frequency
        for state in (0, 1):
            observed = sum(complete_data[node] == state) / n_rows
            assert (np.abs(query[node][state] - observed) < 1e-2)

    # TEST 5: Inference using predict and predict_probability functions
    report = classification_report(bn, complete_data, "z")
    _, auc = roc_auc(bn, complete_data, "z")
    complete_report = classification_report(complete_bn, complete_data, "z")
    _, complete_auc = roc_auc(complete_bn, complete_data, "z")

    for category, metrics in report.items():
        if not isinstance(metrics, dict):
            # a single metric stored directly under the category
            assert np.abs(metrics - complete_report[category]) < 1e-2
            continue

        for key, val in metrics.items():
            assert np.abs(val - complete_report[category][key]) < 1e-2

    assert np.abs(auc - complete_auc) < 1e-2
sm.add_edge("failures", "G1")
sm.remove_edge("Pstatus", "G1")
sm.remove_edge("address", "G1")
sm = sm.get_largest_subgraph()

# elapsed time since `start`, printed as whole seconds
end = time.time() - start
print(int(end))

# Declare the Bayesian network model
bn = BayesianNetwork(sm)
bn = bn.fit_node_states(discretised_data)

# Fit the conditional probability distributions (CPDs)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

# Inspect the target
print(bn.cpds["G1"])  # exam G1 grade - Pass/Fail

# Inspect the inputs without the target (row labelled 18)
print(discretised_data.loc[18, discretised_data.columns != 'G1'])

# Predict
predictions = bn.predict(discretised_data, "G1")
print(f"The prediction is '{predictions.loc[18, 'G1_prediction']}'")
print(f"The ground truth is '{discretised_data.loc[18, 'G1']}'")

# Evaluate
classification_report(bn, test, "G1")