Exemple #1
0
    def test_auc_node_with_no_parents(self):
        """It should be possible to compute the AUC for a node that has no parent nodes"""

        low, high = range(0, 2), range(2, 4)

        # a, b in {0, 1}: c is 1 in all but one row per (a, b) pair
        rows = [[a, b, 0] for a in low for b in low]
        rows += [[a, b, 1] for a in low for b in low
                 for _ in range(a * 10 + b * 10 + 1000)]
        # a, b in {2, 3}: c is 0 in all but one row per (a, b) pair
        rows += [[a, b, 0] for a in high for b in high
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 1] for a in high for b in high]
        train_df = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        # "a" only has an outgoing edge in the structure above
        _, auc = roc_auc(network, train_df, "a")
        assert math.isclose(auc, 0.5, abs_tol=0.01)
Exemple #2
0
    def test_auc_for_nonnumeric_features(self):
        """AUC of accurate predictions should still be 1 once the target states are strings"""

        low, high = range(0, 2), range(2, 4)

        # c is (almost) fully determined by which range a and b fall in,
        # so predictions for c are expected to be accurate
        rows = [[a, b, 0] for a in low for b in low]
        rows += [[a, b, 1] for a in low for b in low
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 0] for a in high for b in high
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 1] for a in high for b in high]
        train_df = pd.DataFrame(rows, columns=["a", "b", "c"])

        # replace the numeric states of column c with string labels
        string_labels = {0: "f", 1: "g"}
        train_df["c"] = train_df["c"].map(string_labels)

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        _, auc = roc_auc(network, train_df, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Exemple #3
0
    def test_auc_of_accurate_predictions(self):
        """The AUC should be 1 when predictions are accurate"""

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows, and c is (almost)
        # fully determined by which range a and b fall in
        rows = [[a, b, 0] for a in low for b in low]
        rows += [[a, b, 1] for a in low for b in low
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 0] for a in high for b in high
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 1] for a in high for b in high]
        train_df = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        _, auc = roc_auc(network, train_df, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Exemple #4
0
    def test_auc_with_missing_state_in_test(self):
        """AUC should still be computed correctly when the test set lacks some states"""

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows, and c is (almost)
        # fully determined by which range a and b fall in
        rows = [[a, b, 0] for a in low for b in low]
        rows += [[a, b, 1] for a in low for b in low
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 0] for a in high for b in high
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 1] for a in high for b in high]
        train_df = pd.DataFrame(rows, columns=["a", "b", "c"])

        # the test set only contains rows with c == 1, so state 0 never appears in it
        only_ones = train_df["c"] == 1
        test_df = train_df[only_ones]
        assert len(test_df["c"].unique()) == 1

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        _, auc = roc_auc(network, test_df, "c")
        assert math.isclose(auc, 1, abs_tol=0.01)
Exemple #5
0
    def test_roc_of_random_has_unit_gradient(self):
        """For random labels the ROC curve should follow the diagonal from (0,0) to (1,1)"""

        levels = range(3)

        # c=1 dominates for every (a, b), but by a different margin each time, so
        # that the roc curve gets several distinct threshold points
        train_rows = [[a, b, 0] for a in levels for b in levels]
        train_rows += [[a, b, 1] for a in levels for b in levels
                       for _ in range(a * 1000 + b * 1000 + 1000)]
        train_df = pd.DataFrame(train_rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        # sanity check: the fitted P(c=1 | a, b) is close to 1 everywhere
        assert np.allclose(network.cpds["c"].loc[1].values, 1, atol=0.02)

        # NOTE(review): the labels are drawn from the unseeded global `random`
        # module, so the data differs between runs
        test_rows = [[a, b, random.randint(0, 1)] for a in levels
                     for b in levels for _ in range(1000)]
        test_df = pd.DataFrame(test_rows, columns=["a", "b", "c"])

        roc, _ = roc_auc(network, test_df, "c")

        assert len(roc) > 3
        assert all(math.isclose(x, y, abs_tol=0.03) for x, y in roc)
Exemple #6
0
    def test_roc_of_accurate_predictions(self):
        """For accurate predictions the TPR should always exceed the FPR"""

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows; each (a, b) pair has
        # ten rows of the minority class
        rows = [[a, b, 0] for a in low for b in low for _ in range(10)]
        rows += [[a, b, 1] for a in low for b in low
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 0] for a in high for b in high
                 for _ in range(a * 10 + b * 10 + 1000)]
        rows += [[a, b, 1] for a in high for b in high for _ in range(10)]
        train_df = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        roc, _ = roc_auc(network, train_df, "c")

        # points with a TPR of exactly 0 or 1 are excluded from the comparison
        interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
        assert all(tpr > fpr for fpr, tpr in interior)
Exemple #7
0
    def test_auc_of_incorrect_close_to_zero(self):
        """When predictions are incorrect the AUC should be close to zero"""

        levels = range(3)

        # c=1 dominates for every (a, b), but by a different margin each time, so
        # that the roc curve gets several distinct threshold points
        train_rows = [[a, b, 0] for a in levels for b in levels]
        train_rows += [[a, b, 1] for a in levels for b in levels
                       for _ in range(a * 1000 + b * 1000 + 1000)]
        train_df = pd.DataFrame(train_rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        # sanity check: the fitted P(c=1 | a, b) is close to 1 everywhere
        assert np.allclose(network.cpds["c"].loc[1].values, 1, atol=0.02)

        # the test data is the reverse of the training data: c=0 dominates
        test_rows = [[a, b, 0] for a in levels for b in levels
                     for _ in range(1000)]
        test_rows += [[a, b, 1] for a in levels for b in levels]
        test_df = pd.DataFrame(test_rows, columns=["a", "b", "c"])

        _, auc = roc_auc(network, test_df, "c")

        assert math.isclose(auc, 0, abs_tol=0.001)
def compare_result_with_ideal(
    em_cpds: Dict[str, pd.DataFrame],
    sm: StructureModel,
    data: pd.DataFrame,
    true_values_lv: np.array,
    node_states: Dict[AnyStr, Union[List, Set]],
) -> Tuple[float, float]:
    """
    Compare learned CPDs with ideal CPDs

    The ideal CPDs are obtained by fitting a Bayesian network on `data` with the
    true latent variable values filled in as column "z". The input dataframe is
    not modified.

    Args:
        em_cpds: Learned CPDs for different nodes
        sm: Structure model
        data: Input dataset
        true_values_lv: Ideal values of the latent variable
        node_states: Possible states of different nodes

    Returns:
        Maximum absolute difference and root mean square of differences
    """
    # Work on a copy so the caller's dataframe does not gain/overwrite a "z" column
    complete_data = data.copy()
    complete_data["z"] = true_values_lv.reshape(-1)

    bn = BayesianNetwork(sm)
    bn.fit_node_states(states_to_df(node_states))
    bn.fit_cpds(complete_data)

    max_delta = -1
    avg_delta = 0

    for node, learned_cpd in em_cpds.items():
        deltas = (learned_cpd - bn.cpds[node]).abs().values
        max_delta = max(max_delta, deltas.max())
        # accumulate the per-node mean squared difference
        avg_delta += np.mean(deltas ** 2)

    # root of the mean (over nodes) of the per-node mean squared differences
    avg_delta = np.sqrt(avg_delta / len(em_cpds))
    return max_delta, avg_delta
Exemple #9
0
    def test_roc_of_incorrect_has_fpr_lt_tpr(self):
        """The ROC of incorrect predictions should lie below the diagonal (FPR > TPR)"""

        levels = range(3)

        # c=1 dominates for every (a, b), but by a different margin each time, so
        # that the roc curve gets several distinct threshold points
        train_rows = [[a, b, 0] for a in levels for b in levels]
        train_rows += [[a, b, 1] for a in levels for b in levels
                       for _ in range(a * 1000 + b * 1000 + 1000)]
        train_df = pd.DataFrame(train_rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        network = BayesianNetwork(structure)
        network.fit_node_states(train_df)
        network.fit_cpds(train_df)

        # sanity check: the fitted P(c=1 | a, b) is close to 1 everywhere
        assert np.allclose(network.cpds["c"].loc[1].values, 1, atol=0.02)

        # the test data is the reverse of the training data: c=0 dominates
        test_rows = [[a, b, 0] for a in levels for b in levels
                     for _ in range(1000)]
        test_rows += [[a, b, 1] for a in levels for b in levels]
        test_df = pd.DataFrame(test_rows, columns=["a", "b", "c"])

        roc, _ = roc_auc(network, test_df, "c")

        assert len(roc) > 3

        # points with a TPR of exactly 0 or 1 are excluded from the comparison
        interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
        assert all(fpr > tpr for fpr, tpr in interior)
Exemple #10
0
    def test_report_ignores_unrequired_columns_in_data(self, train_data_idx,
                                                       train_data_discrete,
                                                       test_data_c_discrete):
        """Classification report should ignore any columns that are not needed by predict"""

        learned_structure = from_pandas(train_data_idx, w_threshold=0.3)
        network = BayesianNetwork(learned_structure)
        network = network.fit_node_states(train_data_discrete)

        # add a column that is not a node of the network before fitting the CPDs
        n_rows = len(train_data_discrete)
        train_data_discrete["NEW_COL"] = [1] * n_rows
        network.fit_cpds(train_data_discrete)

        # passes as long as building the report does not raise
        classification_report(network, test_data_c_discrete, "c")
Exemple #11
0
    def test_create_inference_with_bad_variable_names_fails(
            self, train_model, train_data_idx):
        """Creating an InferenceEngine should fail when a variable name is invalid"""

        # copy the edges of the trained model, turning every "a" into "$a"
        renamed_edges = []
        for u, v in train_model.edges:
            source = str(u).replace("a", "$a")
            target = str(v).replace("a", "$a")
            renamed_edges.append((source, target))

        model = StructureModel()
        model.add_edges_from(renamed_edges)

        # keep the data column in line with the renamed node
        train_data_idx.rename(columns={"a": "$a"}, inplace=True)

        network = BayesianNetwork(model).fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        with pytest.raises(ValueError, match="Variable names must match.*"):
            InferenceEngine(network)
def chain_network() -> BayesianNetwork:
    """
    Build a Bayesian network with a chain structure, fitted on random binary
    data. It is meant for testing do interventions that split the graph into
    subgraphs.

    a → b → c → d → e
    """
    n_samples = 50
    columns = list("abcde")

    # draw integers in [0, 10) and binarise them: values above 6 become 1
    draws = np.random.randint(10, size=(n_samples, len(columns)))
    df = pd.DataFrame(data=(draws > 6).astype(int), columns=columns)

    # connect every node to its successor in the chain
    chain_edges = list(zip(columns, columns[1:]))
    model = StructureModel()
    model.add_edges_from(chain_edges)

    return (BayesianNetwork(model)
            .fit_node_states(df)
            .fit_cpds(df, method="BayesianEstimator", bayes_prior="K2"))
def train_bn(data, graph):
    """
    Build a Bayesian network on `graph` and fit it to `data`.

    Node states are learned from `data`, then the CPDs are fitted on the same
    data with the "BayesianEstimator" method and a "K2" prior.

    Args:
        data: dataset used to learn both the node states and the CPDs
        graph: structure passed to the BayesianNetwork constructor

    Returns:
        The fitted Bayesian network
    """
    return (BayesianNetwork(graph)
            .fit_node_states(data)
            .fit_cpds(data, method='BayesianEstimator', bayes_prior='K2'))
    def test_fit_missing_states(self):
        """test issues/15: fitting should work when a known state is absent from the training data"""

        columns = ["a", "b", "c"]
        train_df = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 1]],
                                columns=columns)
        test_df = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 2]],
                               columns=columns)

        network = BayesianNetwork(StructureModel([("a", "b"), ("c", "b")]))

        # node states come from train + test, so c=2 is a known state even
        # though it never occurs in the rows used to fit the CPDs
        network.fit_node_states(pd.concat([train_df, test_df]))
        network.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        c_cpd = network.cpds["c"]
        assert c_cpd.loc[1][0] == 0.8
        assert c_cpd.loc[2][0] == 0.2
def get_avg_auc_all_info(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
    n_cpus: int = multiprocessing.cpu_count() - 1,
) -> float:
    """
    Utility function to compute AUC using all nodes beyond the parent nodes

    For every cross-validation fold the CPDs are refitted on the training rows
    and `_compute_auc_stub` is evaluated once per node on the test rows, with
    the per-node calls distributed over a process pool.

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number
        n_cpus: Number of CPU cores to use. Values below 1 are raised to 1

    Returns:
        Average AUC
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # The default (cpu_count() - 1) is 0 on a single-core machine, and a pool
    # cannot be created with fewer than one worker
    n_cpus = max(1, n_cpus)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional indices, so rows must be selected by position;
        # label-based selection is only correct for a default RangeIndex
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")
        chunks = [[bn, test_df, target] for target in bn.nodes]

        with multiprocessing.Pool(n_cpus) as p:
            result = p.starmap(_compute_auc_stub, chunks)

        # mean over the nodes for this fold
        total_auc += sum(result) / len(bn.nodes)
        print(
            f"Processing fold {fold} using {n_cpus} cores takes {time() - t0} seconds"
        )

    return total_auc / n_splits
def get_auc_data(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> pd.Series:
    """
    Utility function to compute AUC based only on data observations

    For every cross-validation fold the CPDs are refitted on the training rows
    and `roc_auc` is evaluated for each node on the test rows. The node with
    the lowest mean AUC is printed.

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number

    Returns:
        Mean AUC over the folds for each node, sorted in ascending order
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    nodes_auc = defaultdict(list)

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional indices, so rows must be selected by position;
        # label-based selection is only correct for a default RangeIndex
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            nodes_auc[var].append(auc)

        print(f"Processing fold {fold} takes {time() - t0} seconds")

    # one column per node, one row per fold; average over the folds once
    mean_auc = pd.DataFrame(nodes_auc).mean(axis=0)
    col = mean_auc.idxmin()
    val = mean_auc.min()
    print(f"Variable with lowest AUC is {col} with the value of {val}")
    return mean_auc.sort_values()
def get_avg_auc(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> float:
    """
    Estimate the average auc of all nodes in a Bayesian Network given a structure and a dataset using
    k-fold cross-validation. This function uses the bn.predict method in causalnex and cannot be used
    with latent variable models

    Args:
        df: a dataset in the pandas format
        bn: a bayesian network EM object
        n_splits: Number of folds in k-fold cv
        seed: random seed used in k-fold cv

    Returns:
        Average AUC
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional indices, so rows must be selected by position;
        # label-based selection is only correct for a default RangeIndex
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        # sum of the AUC of every node for this fold
        cur_auc = sum(roc_auc(bn, test_df, var)[1] for var in bn.nodes)

        print(f"Processing fold {fold} takes {time() - t0} seconds")
        total_auc += cur_auc / len(bn.nodes)

    return total_auc / n_splits
Exemple #18
0
def get_correct_cpds(
    df: pd.DataFrame,
    sm: StructureModel,
    node_states: Dict,
    true_lv_values: np.array,
) -> Dict[str, pd.DataFrame]:
    """
    Get the cpds obtained if complete data was provided (no latent variable)

    The true latent variable values are written to column "z" of a copy of
    `df`, so the input dataframe is left untouched.

    Args:
        df: Input dataset
        sm: Structure model
        node_states: Dictionary of node states
        true_lv_values: True values of latent variable

    Returns:
        Ground-truth CPDs of the fitted network, as exposed by its `cpds` attribute
    """
    complete_data = df.copy()
    complete_data["z"] = true_lv_values

    bn = BayesianNetwork(sm)
    bn.fit_node_states(states_to_df(node_states))
    bn.fit_cpds(complete_data)
    return bn.cpds
Exemple #19
0
class BayesianNetworkClassifier(BaseEstimator, ClassifierMixin):
    """
    A class that supports discretising features and probability fitting with scikit-learn syntax

    Example:
    ::
        # Dataset is from https://archive.ics.uci.edu/ml/datasets/student+performance
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder
        >>> from causalnex.discretiser import Discretiser
        >>> from causalnex.network.sklearn import BayesianNetworkClassifier
        >>> from sklearn.model_selection import train_test_split
        >>> data = pd.read_csv('student-por.csv', delimiter=';')
        >>> drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
        >>> data = data.drop(columns=drop_col)
        >>> non_numeric_columns = list(data.select_dtypes(exclude=[np.number]).columns)
        >>> le = LabelEncoder()
        >>> for col in non_numeric_columns:
        >>>     data[col] = le.fit_transform(data[col])
        >>> data["G3"] = Discretiser(method="fixed",
                      numeric_split_points=[10]).transform(data["G3"].values)
        >>> label = data["G3"]
        >>> data.drop(['G3'], axis=1, inplace=True)
        >>> X_train, X_test, y_train, y_test = train_test_split(
                        data, label, test_size=0.1, random_state=7)
        >>> edge_list = [('address', 'absences'),
                         ('Pstatus', 'famrel'),
                         ('Pstatus', 'absences'),
                         ('studytime', 'G1'),
                         ('G1', 'G2'),
                         ('failures', 'absences'),
                         ('failures', 'G1'),
                         ('schoolsup', 'G1'),
                         ('paid', 'absences'),
                         ('higher', 'famrel'),
                         ('higher', 'G1'),
                         ('internet', 'absences'),
                         ('G2', 'G3')]
        >>> discretiser_param = {
                'absences': {'method':"fixed",
                             'numeric_split_points':[1, 10]
                            },
                 'G1': {'method':"fixed",
                        'numeric_split_points':[10]
                       },
                 'G2': {'method':"fixed",
                        'numeric_split_points':[10]
                       }
                }
        >>> discretiser_alg = {'absences': 'unsupervised',
                              'G1': 'unsupervised',
                              'G2': 'unsupervised'
                             }
        >>> bayesian_param = {'method':"BayesianEstimator", 'bayes_prior':"K2"}
        >>> clf = BayesianNetworkClassifier(edge_list, discretiser_alg, discretiser_param, bayesian_param)
        >>> clf.fit(X_train, y_train)
        >>> clf.predict(X_test)
        array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
               1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
               1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0])

    """
    def __init__(
        self,
        list_of_edges: List[Tuple[str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify the algorithm used to discretise
            each feature in the data, keyed by feature name. Available options
            for the dictionary values are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            If None, the data is not discretised.
            discretiser_kwargs (dict): Keyword arguments for discretisation methods,
            keyed by feature name. Must have the same keys as discretiser_alg.
            Only applicable if discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability model.
            Defaults to {"method": "BayesianEstimator", "bayes_prior": "K2"}
            return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

        Raises:
            KeyError: If a value in discretiser_alg is not a supported algorithm
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """

        # fall back to the default probability model settings when none are given
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        # both are set by `fit`: the name of the label column and the training
        # data (features plus label) the discretisers are fitted on
        self._target_name = None
        self._discretise_data = None

    @staticmethod
    def _validate_discretiser(discretiser_alg, discretiser_kwargs):
        """
        Check that the discretiser configuration is usable.

        Args:
            discretiser_alg (dict): algorithm name for each feature
            discretiser_kwargs (dict): discretiser keyword arguments for each feature

        Raises:
            KeyError: If any algorithm is not one of 'unsupervised', 'tree', 'mdlp'
            ValueError: If the two dictionaries do not have the same keys
        """
        # maps each feature to True when its algorithm is NOT supported
        unavailable_discretiser_algs = {
            k: v not in ["unsupervised", "tree", "mdlp"]
            for k, v in discretiser_alg.items()
        }

        if any(unavailable_discretiser_algs.values()):
            # collect only the offending entries for the error message
            algs = {
                k: discretiser_alg[k]
                for k, v in unavailable_discretiser_algs.items() if v
            }
            raise KeyError(
                f"Some discretiser algorithms are not supported: `{algs}`. "
                "Please choose in ['unsupervised', 'tree', 'mdlp']")

        if set(discretiser_kwargs) != set(discretiser_alg):
            raise ValueError(
                "discretiser_alg and discretiser_kwargs should have the same keys"
            )

    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.
        The splitting thresholds are extracted from the training data
        stored by `fit`; the discretisers are fitted again on every call.

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised version of the input dataframe
        """

        # work on a copy so the caller's dataframe is not modified
        X = X.copy()

        for col in self.discretiser_alg.keys():

            if self.discretiser_alg[col] == "unsupervised":

                if self.discretiser_kwargs[col]["method"] == "fixed":
                    # the "fixed" method is applied without a call to fit
                    X[col] = Discretiser(
                        **self.discretiser_kwargs[col]).transform(
                            X[col].values)
                else:
                    # other unsupervised methods are fitted on the training column
                    discretiser = Discretiser(
                        **self.discretiser_kwargs[col]).fit(
                            self._discretise_data[col].values)
                    X[col] = discretiser.transform(X[col].values)

            else:
                # validation in __init__ only lets "tree" and "mdlp" reach this branch
                if self.discretiser_alg[col] == "tree":
                    discretiser = DecisionTreeSupervisedDiscretiserMethod(
                        mode="single",
                        tree_params=self.discretiser_kwargs[col])

                elif self.discretiser_alg[col] == "mdlp":
                    discretiser = MDLPSupervisedDiscretiserMethod(
                        self.discretiser_kwargs[col])

                # supervised discretisers are fitted against the training label
                discretiser.fit(
                    dataframe=self._discretise_data,
                    feat_names=[col],
                    target=self._target_name,
                    target_continuous=False,
                )

                X[col] = discretiser.transform(X[[col]])

        return X

    def fit(self, X: pd.DataFrame,
            y: pd.Series) -> "BayesianNetworkClassifier":
        """
        Build a Bayesian Network classifier from a set of training data.
        The method first discretises the feature using parameters in `discretiser_kwargs`
        and `discretiser_alg`. Next, it learns all the possible states that each node
        can have. Finally, it learns the CPDs of the Bayesian Network.

        Args:
            X (pd.DataFrame): input training data
            y (pd.Series): categorical label for each row of X. Its `name` is
            used as the name of the target column

        Returns:
            self
        """
        # keep the raw training data (features + label) for fitting the discretisers
        self._discretise_data = X.copy()
        self._discretise_data[y.name] = y
        self._target_name = y.name
        X = self._discretise_features(X)

        # the network is fitted on the discretised features together with the label
        X[y.name] = y
        self.bn = self.bn.fit_node_states(X)
        self.bn = self.bn.fit_cpds(X, **self.probability_kwargs)

        return self

    def predict(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]:
        """
        Return predictions for the input data

        Args:
            X (pd.DataFrame): A dataframe of shape (num_row, num_features) for model to predict

        Returns:
            Model's prediction: A numpy array of shape (num_row,) when `return_prob`
            is False, otherwise the output of the network's `predict_probability`

        Raises:
            ValueError: if CPDs are empty

        """
        if self.bn.cpds == {}:
            raise ValueError("No CPDs found. The model has not been fitted")

        # apply the same discretisation that was used on the training data
        X = self._discretise_features(X)

        if self.return_prob:
            pred = self.bn.predict_probability(X, self._target_name)
        else:
            # flatten the prediction to a 1-D array
            pred = self.bn.predict(X, self._target_name).to_numpy().reshape(-1)

        return pred
Exemple #20
0
    def test_create_inference_from_bn(self, train_model, train_data_idx):
        """It should be possible to create an InferenceEngine from a fitted BayesianNetwork"""

        network = BayesianNetwork(train_model)
        network = network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        # passes as long as construction does not raise
        InferenceEngine(network)
    def test_em_algorithm(self):  # pylint: disable=too-many-locals
        """
        Test if `BayesianNetwork` works with EM algorithm.
        We use a naive bayes + parents + an extra node not related to the latent variable.

        A network fitted on complete data (true latent values included) is the
        baseline; the network fitted with `fit_latent_cpds` is compared against it.
        """

        # p0   p1  p2
        #   \  |  /
        #      z
        #   /  |  \
        # c0  c1  c2
        # |
        # cc0
        # Fixed seed: the random draws below must happen in this exact order
        np.random.seed(22)

        data, sm, _, true_lv_values = naive_bayes_plus_parents(
            percentage_not_missing=0.1,
            samples=1000,
            p_z=0.7,
            p_c=0.7,
        )
        # cc_0 copies c_0 in about half of the rows and is (c_0 + 1) % 3 otherwise
        data["cc_0"] = np.where(
            np.random.random(len(data)) < 0.5, data["c_0"],
            (data["c_0"] + 1) % 3)
        # the latent column is removed; the true values are kept separately
        data.drop(columns=["z"], inplace=True)

        complete_data = data.copy(deep=True)
        complete_data["z"] = true_lv_values

        # Baseline model: the structure of the figure trained with complete data. We try to reproduce it
        complete_bn = BayesianNetwork(
            StructureModel(list(sm.edges) + [("c_0", "cc_0")]))
        complete_bn.fit_node_states_and_cpds(complete_data)

        # BN without latent variable: All `p`s are connected to all `c`s + `c0` ->`cc0`
        sm_no_lv = StructureModel([(f"p_{p}", f"c_{c}") for p in range(3)
                                   for c in range(3)] + [("c_0", "cc_0")])
        bn = BayesianNetwork(sm_no_lv)
        bn.fit_node_states(data)
        bn.fit_cpds(data)

        # TEST 1: cc_0 does not depend on the latent variable so:
        assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

        # BN with latent variable
        # When we add the latent variable, we add the edges in the image above
        # and remove the connection among `p`s and `c`s
        edges_to_add = list(sm.edges)
        edges_to_remove = [(f"p_{p}", f"c_{c}") for p in range(3)
                           for c in range(3)]
        bn.add_node("z", edges_to_add, edges_to_remove)
        # the latent variable is given the three states 0, 1 and 2
        bn.fit_latent_cpds("z", [0, 1, 2], data, stopping_delta=0.001)

        # TEST 2: cc_0 CPD should remain untouched by the EM algorithm
        assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

        # TEST 3: We should recover the correct CPDs quite accurately
        assert bn.cpds.keys() == complete_bn.cpds.keys()
        assert self.mean_absolute_error(bn.cpds, complete_bn.cpds) < 0.01

        # TEST 4: Inference over recovered CPDs should be also accurate
        eng = InferenceEngine(bn)
        query = eng.query()
        n_rows = complete_data.shape[0]

        # compare the queried probability of states 0 and 1 of every node with
        # the frequency of that state in the complete data
        for node in query:
            assert (np.abs(query[node][0] -
                           sum(complete_data[node] == 0) / n_rows) < 1e-2)
            assert (np.abs(query[node][1] -
                           sum(complete_data[node] == 1) / n_rows) < 1e-2)

        # TEST 5: Inference using predict and predict_probability functions
        report = classification_report(bn, complete_data, "z")
        _, auc = roc_auc(bn, complete_data, "z")
        complete_report = classification_report(complete_bn, complete_data,
                                                "z")
        _, complete_auc = roc_auc(complete_bn, complete_data, "z")

        # report entries are either a dict of metrics or a single number
        for category, metrics in report.items():
            if isinstance(metrics, dict):
                for key, val in metrics.items():
                    assert np.abs(val - complete_report[category][key]) < 1e-2
            else:
                assert np.abs(metrics - complete_report[category]) < 1e-2

        assert np.abs(auc - complete_auc) < 1e-2
# Manually adjust the edges of the structure model
sm.add_edge("failures", "G1")
sm.remove_edge("Pstatus", "G1")
sm.remove_edge("address", "G1")

# Keep only the largest subgraph of the structure
sm = sm.get_largest_subgraph()

# Print the elapsed time in whole seconds since `start`
end = time.time() - start
print(int(end))


# Declare the Bayesian network model
bn = BayesianNetwork(sm)
bn = bn.fit_node_states(discretised_data)

# Fit the conditional probability distributions (CPDs)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

# Check the target
print(bn.cpds["G1"])  # Exam G1 grade - Pass/Fail

# Check the input (row 18) excluding the target
print(discretised_data.loc[18, discretised_data.columns != 'G1'])


# Predict
predictions = bn.predict(discretised_data, "G1")
print('The prediction is \'{prediction}\''.format(prediction=predictions.loc[18, 'G1_prediction']))
print('The ground truth is \'{truth}\''.format(truth=discretised_data.loc[18, 'G1']))

# Evaluate
classification_report(bn, test, "G1")