def classification_report(bn: BayesianNetwork, data: pd.DataFrame, node: str) -> pd.DataFrame:
    """
    Build a report of the main classification metrics for a single node.

    The node is predicted for every row of ``data`` and the predictions are
    scored against the observed values in ``data[node]`` with scikit-learn's
    ``classification_report``.

    Args:
        bn (BayesianNetwork): model used to make the predictions.
        data (pd.DataFrame): test data to predict on; must contain ``node``.
        node (str): name of the variable to generate the report for.

    Returns:
        A DataFrame built from the scikit-learn report dictionary, with one
        row per report entry. Classes are named ``<node>_<state>``; the
        remaining rows are the averages computed by scikit-learn (see the
        example below). Note that in binary classification, recall of the
        positive class is also known as "sensitivity"; recall of the negative
        class is "specificity".

    Example:
    ::
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>     'rush_hour': [True, False, False, False, True, False, True],
        >>>     'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>     'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>> })
        >>> bn = bn.fit_node_states_and_cpds(data)
        >>> test_data = pd.DataFrame({
        >>>     'rush_hour': [False, False, True, True],
        >>>     'weather': ['Good', 'Bad', 'Good', 'Bad'],
        >>>     'traffic': ['light', 'heavy', 'heavy', 'light']
        >>> })
        >>> from causalnex.evaluation import classification_report
        >>> classification_report(bn, test_data, "traffic").to_dict()
        {'precision': {
            'macro avg': 0.8333333333333333,
            'micro avg': 0.75,
            'traffic_heavy': 0.6666666666666666,
            'traffic_light': 1.0,
            'weighted avg': 0.8333333333333333
          },
         'recall': {
            'macro avg': 0.75,
            'micro avg': 0.75,
            'traffic_heavy': 1.0,
            'traffic_light': 0.5,
            'weighted avg': 0.75
          },
         'f1-score': {
            'macro avg': 0.7333333333333334,
            'micro avg': 0.75,
            'traffic_heavy': 0.8,
            'traffic_light': 0.6666666666666666,
            'weighted avg': 0.7333333333333334
          },
         'support': {
            'macro avg': 4,
            'micro avg': 4,
            'traffic_heavy': 2,
            'traffic_light': 2,
            'weighted avg': 4
          }}
    """
    predicted = bn.predict(data, node)

    # Sort the states once so the labels and their display names stay aligned.
    class_labels = sorted(bn.node_states[node])
    display_names = [f"{node}_{state!s}" for state in class_labels]

    report_dict = metrics.classification_report(
        y_true=data[node],
        y_pred=predicted,
        labels=class_labels,
        target_names=display_names,
        output_dict=True,
    )
    return pd.DataFrame.from_dict(report_dict, orient="index")
class BayesianNetworkClassifier(BaseEstimator, ClassifierMixin):
    """
    A class that supports discretising features and probability fitting with scikit-learn syntax

    Example:
    ::
        # Dataset is from https://archive.ics.uci.edu/ml/datasets/student+performance
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder
        >>> from causalnex.discretiser import Discretiser
        >>> from causalnex.network.sklearn import BayesianNetworkClassifier
        >>> from sklearn.model_selection import train_test_split
        >>> data = pd.read_csv('student-por.csv', delimiter=';')
        >>> drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
        >>> data = data.drop(columns=drop_col)
        >>> non_numeric_columns = list(data.select_dtypes(exclude=[np.number]).columns)
        >>> le = LabelEncoder()
        >>> for col in non_numeric_columns:
        >>>     data[col] = le.fit_transform(data[col])
        >>> data["G3"] = Discretiser(method="fixed",
                numeric_split_points=[10]).transform(data["G3"].values)
        >>> label = data["G3"]
        >>> data.drop(['G3'], axis=1, inplace=True)
        >>> X_train, X_test, y_train, y_test = train_test_split(
                data, label, test_size=0.1, random_state=7)
        >>> edge_list = [('address', 'absences'),
                         ('Pstatus', 'famrel'),
                         ('Pstatus', 'absences'),
                         ('studytime', 'G1'),
                         ('G1', 'G2'),
                         ('failures', 'absences'),
                         ('failures', 'G1'),
                         ('schoolsup', 'G1'),
                         ('paid', 'absences'),
                         ('higher', 'famrel'),
                         ('higher', 'G1'),
                         ('internet', 'absences'),
                         ('G2', 'G3')]
        >>> discretiser_param = {
                'absences': {'method':"fixed",
                             'numeric_split_points':[1, 10]
                            },
                'G1': {'method':"fixed",
                       'numeric_split_points':[10]
                      },
                'G2': {'method':"fixed",
                       'numeric_split_points':[10]
                      }
            }
        >>> discretiser_alg = {'absences': 'unsupervised',
                               'G1': 'unsupervised',
                               'G2': 'unsupervised'
                              }
        >>> bayesian_param = {'method':"BayesianEstimator", 'bayes_prior':"K2"}
        >>> clf = BayesianNetworkClassifier(edge_list, discretiser_alg,
                                            discretiser_param, bayesian_param)
        >>> clf.fit(X_train, y_train)
        >>> clf.predict(X_test)
        array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
               1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
               1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0])
    """

    def __init__(
        self,
        list_of_edges: List[Tuple[str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify an algorithm to discretise
                each feature in the data, keyed by feature name. Available
                options for the dictionary values are
                ['unsupervised', 'tree', 'mdlp']
                - if 'unsupervised': discretise the data using unsupervised method
                - if 'tree': discretise the data using decision tree method
                - if 'mdlp': discretise the data using MDLP method
            discretiser_kwargs (dict): Keyword arguments for discretisation
                methods, keyed by feature name. Only applicable if
                discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability
                model. Defaults to a ``BayesianEstimator`` with a ``K2`` prior.
            return_prob (bool): choose to return predictions or probability
                - if True: return pandas dataframe with predicted probability
                  for each state
                - if False: return a 1-D prediction array

        Raises:
            KeyError: If an incorrect argument is passed
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info(
                "No discretiser algorithm was given. "
                "The training data will not be discretised"
            )
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        # Validate before touching any state so a bad configuration leaves
        # no half-initialised estimator behind.
        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        # Both are populated by `fit` and read back when discretising.
        self._target_name = None
        self._discretise_data = None

    @staticmethod
    def _validate_discretiser(discretiser_alg, discretiser_kwargs):
        """
        Check that every requested algorithm is supported and that both
        configuration dictionaries describe the same set of features.

        Raises:
            KeyError: if any value of `discretiser_alg` is not supported
            ValueError: if the two dictionaries do not share the same keys
        """
        algs = {
            col: alg
            for col, alg in discretiser_alg.items()
            if alg not in ("unsupervised", "tree", "mdlp")
        }
        if algs:
            raise KeyError(
                f"Some discretiser algorithms are not supported: `{algs}`. "
                "Please choose in ['unsupervised', 'tree', 'mdlp']"
            )

        if set(discretiser_kwargs) != set(discretiser_alg):
            raise ValueError(
                "discretiser_alg and discretiser_kwargs should have the same keys"
            )

    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.
        The splitting thresholds are extracted from the training data

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised version of the input dataframe

        Raises:
            KeyError: if a feature is configured with an unsupported algorithm
        """
        X = X.copy()

        for col, alg in self.discretiser_alg.items():
            if alg == "unsupervised":
                col_kwargs = self.discretiser_kwargs[col]
                if col_kwargs["method"] == "fixed":
                    # Fixed split points need no fitting.
                    X[col] = Discretiser(**col_kwargs).transform(X[col].values)
                else:
                    # Thresholds are always learnt from the stored training data.
                    discretiser = Discretiser(**col_kwargs).fit(
                        self._discretise_data[col].values
                    )
                    X[col] = discretiser.transform(X[col].values)
                continue

            if alg == "tree":
                discretiser = DecisionTreeSupervisedDiscretiserMethod(
                    mode="single", tree_params=self.discretiser_kwargs[col]
                )
            elif alg == "mdlp":
                discretiser = MDLPSupervisedDiscretiserMethod(
                    self.discretiser_kwargs[col]
                )
            else:
                # Only reachable if the configuration was changed after
                # construction; fail loudly instead of reusing a discretiser
                # left over from a previous column.
                raise KeyError(
                    f"Discretiser algorithm `{alg}` for feature `{col}` is not "
                    "supported. Please choose in ['unsupervised', 'tree', 'mdlp']"
                )

            discretiser.fit(
                dataframe=self._discretise_data,
                feat_names=[col],
                target=self._target_name,
                target_continuous=False,
            )
            X[col] = discretiser.transform(X[[col]])

        return X

    def fit(self, X: pd.DataFrame, y: pd.Series) -> "BayesianNetworkClassifier":
        """
        Build a Bayesian Network classifier from a set of training data.
        The method first discretises the feature using parameters in
        `discretiser_kwargs` and `discretiser_alg`.
        Next, it learns all the possible nodes that each feature can have.
        Finally, it learns the CPDs of the Bayesian Network.

        Args:
            X (pd.DataFrame): input training data
            y (pd.Series): categorical label for each row of X. Its `name`
                is used as the target column and as the node to predict.

        Returns:
            self
        """
        # Keep an undiscretised copy (features + target): the discretisers
        # are fitted against it both now and at prediction time.
        self._discretise_data = X.copy()
        self._discretise_data[y.name] = y
        self._target_name = y.name

        X = self._discretise_features(X)
        X[y.name] = y

        self.bn = self.bn.fit_node_states(X)
        self.bn = self.bn.fit_cpds(X, **self.probability_kwargs)
        return self

    def predict(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]:
        """
        Return predictions for the input data

        Args:
            X (pd.DataFrame): A dataframe of shape (num_row, num_features)
                for model to predict

        Returns:
            Model's prediction: A numpy array of shape (num_row,), or the
            output of `predict_probability` for the target node when
            `return_prob` is True

        Raises:
            ValueError: if CPDs are empty
        """
        if self.bn.cpds == {}:
            raise ValueError("No CPDs found. The model has not been fitted")

        X = self._discretise_features(X)

        if self.return_prob:
            pred = self.bn.predict_probability(X, self._target_name)
        else:
            pred = self.bn.predict(X, self._target_name).to_numpy().reshape(-1)

        return pred
# Declare the Bayesian network model
bn = BayesianNetwork(sm)
bn = bn.fit_node_states(discretised_data)

# Fit the conditional probability distributions (CPDs)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

# Inspect the target
print(bn.cpds["G1"])  # G1 exam grade - Pass/Fail

# Inspect the inputs (row 18), excluding the target
feature_mask = discretised_data.columns != "G1"
print(discretised_data.loc[18, feature_mask])

# Predict
predictions = bn.predict(discretised_data, "G1")
predicted_label = predictions.loc[18, "G1_prediction"]
print("The prediction is '{prediction}'".format(prediction=predicted_label))
true_label = discretised_data.loc[18, "G1"]
print("The ground truth is '{truth}'".format(truth=true_label))

# Evaluate
classification_report(bn, test, "G1")
roc, auc = roc_auc(bn, test, "G1")
print(auc)

# Marginal probability baseline (same as above)
bn = bn.fit_cpds(discretised_data, method="BayesianEstimator", bayes_prior="K2")

# Compute the marginal likelihoods for all states and nodes
ie = InferenceEngine(bn)