def roc_auc(
    bn: BayesianNetwork, data: pd.DataFrame, node: str
) -> Tuple[List[Tuple[float, float]], float]:
    """
    Build a report of the micro-average Receiver-Operating Characteristics (ROC), and the Area Under the ROC curve
    Micro-average computes roc_auc over all predictions for all states of node.

    Args:
        bn (BayesianNetwork): model to compute roc_auc.
        data (pd.DataFrame): test data that will be used to calculate ROC.
        node (str): name of the variable to generate the report for.

    Returns:
        roc - auc tuple
         - roc (List[Tuple[float, float]]): list of [(fpr, tpr)] observations.
         - auc float: auc for the node predictions.

    Example:
    ::
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states_and_cpds(data)
        >>> test_data = pd.DataFrame({
        >>>                          'rush_hour': [False, False, True, True],
        >>>                          'weather': ['Good', 'Bad', 'Good', 'Bad'],
        >>>                          'traffic': ['light', 'heavy', 'heavy', 'light']
        >>>                          })
        >>> from causalnex.evaluation import roc_auc
        >>> roc, auc = roc_auc(bn, test_data, "traffic")
        >>> print(auc)
        0.75
    """
    ground_truth = _build_ground_truth(bn, data, node)
    predictions = bn.predict_probability(data, node)

    # Update column names to match those of ground_truth by removing the
    # leading "<node>_" from each column. This must be a true prefix removal:
    # str.lstrip() treats its argument as a set of characters and would also
    # eat the start of any state name made of those characters
    # (e.g. node "traffic", state "fast" -> "st"), which can change the
    # sorted column order below and misalign predictions with ground_truth.
    prefix = node + "_"
    predictions.rename(
        columns=lambda x: x[len(prefix):] if x.startswith(prefix) else x,
        inplace=True,
    )
    predictions = predictions[sorted(predictions.columns)]

    # Micro-average: flatten every (row, state) pair into one long vector of
    # binary labels and one of scores before computing the curve.
    fpr, tpr, _ = metrics.roc_curve(
        ground_truth.values.ravel(), predictions.values.ravel()
    )
    roc = list(zip(fpr, tpr))
    auc = metrics.auc(fpr, tpr)

    return roc, auc
class BayesianNetworkClassifier(BaseEstimator, ClassifierMixin):
    """
    A class that supports discretising features and probability fitting with scikit-learn syntax

    Example:
    ::
        # Dataset is from https://archive.ics.uci.edu/ml/datasets/student+performance
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder
        >>> from causalnex.discretiser import Discretiser
        >>> from causalnex.network.sklearn import BayesianNetworkClassifier
        >>> from sklearn.model_selection import train_test_split
        >>> data = pd.read_csv('student-por.csv', delimiter=';')
        >>> drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
        >>> data = data.drop(columns=drop_col)
        >>> non_numeric_columns = list(data.select_dtypes(exclude=[np.number]).columns)
        >>> le = LabelEncoder()
        >>> for col in non_numeric_columns:
        >>>     data[col] = le.fit_transform(data[col])
        >>> data["G3"] = Discretiser(method="fixed",
                          numeric_split_points=[10]).transform(data["G3"].values)
        >>> label = data["G3"]
        >>> data.drop(['G3'], axis=1, inplace=True)
        >>> X_train, X_test, y_train, y_test = train_test_split(
                        data, label, test_size=0.1, random_state=7)
        >>> edge_list = [('address', 'absences'),
                         ('Pstatus', 'famrel'),
                         ('Pstatus', 'absences'),
                         ('studytime', 'G1'),
                         ('G1', 'G2'),
                         ('failures', 'absences'),
                         ('failures', 'G1'),
                         ('schoolsup', 'G1'),
                         ('paid', 'absences'),
                         ('higher', 'famrel'),
                         ('higher', 'G1'),
                         ('internet', 'absences'),
                         ('G2', 'G3')]
        >>> discretiser_param = {
                'absences': {'method':"fixed",
                             'numeric_split_points':[1, 10]
                            },
                'G1': {'method':"fixed",
                       'numeric_split_points':[10]
                      },
                'G2': {'method':"fixed",
                       'numeric_split_points':[10]
                      }
            }
        >>> discretiser_alg = {'absences': 'unsupervised',
                               'G1': 'unsupervised',
                               'G2': 'unsupervised'
                              }
        >>> bayesian_param = {'method':"BayesianEstimator", 'bayes_prior':"K2"}
        >>> clf = BayesianNetworkClassifier(edge_list, discretiser_alg, discretiser_param, bayesian_param)
        >>> clf.fit(X_train, y_train)
        >>> clf.predict(X_test)
        array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
               1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
               1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0])
    """

    def __init__(
        self,
        list_of_edges: List[Tuple[str, str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify the algorithm used to discretise
            each feature in the data, keyed by feature name. Available options
            for the dictionary values are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            discretiser_kwargs (dict): Keyword arguments for discretisation methods,
            keyed by feature name. Must have exactly the same keys as discretiser_alg.
            probability_kwargs (dict): keyword arguments for the probability model.
            Defaults to {"method": "BayesianEstimator", "bayes_prior": "K2"}.
            return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

        Raises:
            KeyError: If an unsupported discretiser algorithm is named
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        # Both are populated by fit(): the name of the label column and a copy
        # of the training frame (features + label) that the discretisers are
        # fitted against.
        self._target_name = None
        self._discretise_data = None

    @staticmethod
    def _validate_discretiser(discretiser_alg, discretiser_kwargs):
        """
        Check that every requested discretiser algorithm is supported and that
        the algorithm and keyword-argument dictionaries describe the same features.

        Args:
            discretiser_alg (dict): feature name -> algorithm name
            discretiser_kwargs (dict): feature name -> keyword arguments

        Raises:
            KeyError: If any algorithm is not one of 'unsupervised', 'tree', 'mdlp'
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """
        supported = ("unsupervised", "tree", "mdlp")
        algs = {k: v for k, v in discretiser_alg.items() if v not in supported}

        if algs:
            raise KeyError(
                f"Some discretiser algorithms are not supported: `{algs}`. "
                "Please choose in ['unsupervised', 'tree', 'mdlp']")

        if set(discretiser_kwargs) != set(discretiser_alg):
            raise ValueError(
                "discretiser_alg and discretiser_kwargs should have the same keys"
            )

    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.
        The splitting thresholds are extracted from the training data

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised version of the input dataframe

        Raises:
            KeyError: If a feature is configured with an unsupported algorithm
        """
        # Work on a copy so the caller's dataframe is left untouched.
        X = X.copy()

        for col, alg in self.discretiser_alg.items():
            kwargs = self.discretiser_kwargs[col]

            if alg == "unsupervised":
                if kwargs["method"] == "fixed":
                    # The "fixed" method is applied directly, without fitting.
                    X[col] = Discretiser(**kwargs).transform(X[col].values)
                else:
                    # Other methods are fitted on the stored training column.
                    discretiser = Discretiser(**kwargs).fit(
                        self._discretise_data[col].values)
                    X[col] = discretiser.transform(X[col].values)
                continue

            if alg == "tree":
                discretiser = DecisionTreeSupervisedDiscretiserMethod(
                    mode="single", tree_params=kwargs)
            elif alg == "mdlp":
                discretiser = MDLPSupervisedDiscretiserMethod(kwargs)
            else:
                # Only reachable when discretiser_alg was modified after
                # construction; fail loudly rather than hit an unbound (or a
                # previous column's) discretiser.
                raise KeyError(
                    f"Unsupported discretiser algorithm `{alg}` for feature `{col}`. "
                    "Please choose in ['unsupervised', 'tree', 'mdlp']")

            # Supervised discretisers are fitted on the stored training frame,
            # which includes the target column.
            discretiser.fit(
                dataframe=self._discretise_data,
                feat_names=[col],
                target=self._target_name,
                target_continuous=False,
            )
            X[col] = discretiser.transform(X[[col]])

        return X

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BayesianNetworkClassifier":
        """
        Build a Bayesian Network classifier from a set of training data.
        The method first discretises the feature using parameters in `discretiser_kwargs`
        and `discretiser_alg`. Next, it learns all the possible nodes that each feature
        can have. Finally, it learns the CPDs of the Bayesian Network.

        Args:
            X (pd.DataFrame): input training data
            y (pd.Series): categorical label for each row of X. Its `name` is used
            as the target column / node name.

        Returns:
            self
        """
        # Keep the raw training features and label: _discretise_features fits
        # its discretisers against this frame, both here and in predict().
        self._discretise_data = X.copy()
        self._discretise_data[y.name] = y
        self._target_name = y.name

        X = self._discretise_features(X)
        X[y.name] = y

        self.bn = self.bn.fit_node_states(X)
        self.bn = self.bn.fit_cpds(X, **self.probability_kwargs)
        return self

    def predict(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]:
        """
        Return predictions for the input data

        Args:
            X (pd.DataFrame): A dataframe of shape (num_row, num_features) for model to predict

        Returns:
            Model's prediction:
            - if `return_prob` is True: the result of `predict_probability` for the target node
            - otherwise: a numpy array of shape (num_row,)

        Raises:
            ValueError: if CPDs are empty
        """
        if not self.bn.cpds:
            raise ValueError("No CPDs found. The model has not been fitted")

        X = self._discretise_features(X)

        if self.return_prob:
            return self.bn.predict_probability(X, self._target_name)

        return self.bn.predict(X, self._target_name).to_numpy().reshape(-1)