Example #1
import pandas as pd
from sklearn import metrics

from causalnex.network import BayesianNetwork

def classification_report(bn: BayesianNetwork, data: pd.DataFrame,
                          node: str) -> pd.DataFrame:
    """
    Build a report showing the main classification metrics.

    Args:
        bn (BayesianNetwork): fitted model used to make the predictions.
        data (pd.DataFrame): test data that will be used for predictions.
        node (str): name of the variable to generate the report for.

    Returns:
        A DataFrame summarising the precision, recall and F1 score for each class.

        The reported averages include micro average (averaging the
        total true positives, false negatives and false positives), macro
        average (averaging the unweighted mean per label), weighted average
        (averaging the support-weighted mean per label) and sample average
        (only for multilabel classification).

        Note that in binary classification, recall of the positive class
        is also known as "sensitivity"; recall of the negative class is
        "specificity".

    Example:
    ::
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states_and_cpds(data)
        >>> test_data = pd.DataFrame({
        >>>                         'rush_hour': [False, False, True, True],
        >>>                         'weather': ['Good', 'Bad', 'Good', 'Bad'],
        >>>                         'traffic': ['light', 'heavy', 'heavy', 'light']
        >>>                         })
        >>> from causalnex.evaluation import classification_report
        >>> classification_report(bn, test_data, "traffic").to_dict()
        {'precision': {
            'macro avg': 0.8333333333333333, 'micro avg': 0.75,
            'traffic_heavy': 0.6666666666666666,
            'traffic_light': 1.0,
            'weighted avg': 0.8333333333333333
          },
         'recall': {
            'macro avg': 0.75,
            'micro avg': 0.75,
            'traffic_heavy': 1.0,
            'traffic_light': 0.5,
            'weighted avg': 0.75
          },
         'f1-score': {
            'macro avg': 0.7333333333333334,
            'micro avg': 0.75,
            'traffic_heavy': 0.8,
            'traffic_light': 0.6666666666666666,
            'weighted avg': 0.7333333333333334
          },
         'support': {
            'macro avg': 4,
            'micro avg': 4,
            'traffic_heavy': 2,
            'traffic_light': 2,
            'weighted avg': 4
          }}
    """

    predictions = bn.predict(data, node)

    labels = sorted(list(bn.node_states[node]))
    target_names = [
        "{0}_{1}".format(node, str(v)) for v in sorted(bn.node_states[node])
    ]
    report = metrics.classification_report(
        y_true=data[node],
        y_pred=predictions,
        labels=labels,
        target_names=target_names,
        output_dict=True,
    )

    return pd.DataFrame.from_dict(report, orient="index")
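As a quick illustration of the averaging schemes described in the docstring above, here is a small sketch on toy labels (invented for illustration) using scikit-learn directly:

    from sklearn import metrics

    y_true = ["heavy", "heavy", "light", "light"]
    y_pred = ["heavy", "heavy", "heavy", "light"]

    report = metrics.classification_report(y_true, y_pred, output_dict=True)
    # unweighted mean over classes: (1.0 + 0.5) / 2 = 0.75
    print(report["macro avg"]["recall"])
    # support-weighted mean; identical here because both classes have support 2
    print(report["weighted avg"]["recall"])
    # accuracy (3/4 = 0.75) equals micro-averaged recall for single-label data
    print(metrics.accuracy_score(y_true, y_pred))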
Example #2
import logging
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin

from causalnex.discretiser import Discretiser
from causalnex.discretiser.discretiser_strategy import (
    DecisionTreeSupervisedDiscretiserMethod,
    MDLPSupervisedDiscretiserMethod,
)
from causalnex.network import BayesianNetwork
from causalnex.structure import StructureModel

class BayesianNetworkClassifier(BaseEstimator, ClassifierMixin):
    """
    A class that supports discretising features and probability fitting with scikit-learn syntax

    Example:
    ::
        # Dataset is from https://archive.ics.uci.edu/ml/datasets/student+performance
        >>> import pandas as pd
        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder
        >>> from causalnex.discretiser import Discretiser
        >>> from causalnex.network.sklearn import BayesianNetworkClassifier
        >>> from sklearn.model_selection import train_test_split
        >>> data = pd.read_csv('student-por.csv', delimiter=';')
        >>> drop_col = ['school','sex','age','Mjob', 'Fjob','reason','guardian']
        >>> data = data.drop(columns=drop_col)
        >>> non_numeric_columns = list(data.select_dtypes(exclude=[np.number]).columns)
        >>> le = LabelEncoder()
        >>> for col in non_numeric_columns:
        >>>     data[col] = le.fit_transform(data[col])
        >>> data["G3"] = Discretiser(method="fixed",
                      numeric_split_points=[10]).transform(data["G3"].values)
        >>> label = data["G3"]
        >>> data.drop(['G3'], axis=1, inplace=True)
        >>> X_train, X_test, y_train, y_test = train_test_split(
                        data, label, test_size=0.1, random_state=7)
        >>> edge_list = [('address', 'absences'),
                         ('Pstatus', 'famrel'),
                         ('Pstatus', 'absences'),
                         ('studytime', 'G1'),
                         ('G1', 'G2'),
                         ('failures', 'absences'),
                         ('failures', 'G1'),
                         ('schoolsup', 'G1'),
                         ('paid', 'absences'),
                         ('higher', 'famrel'),
                         ('higher', 'G1'),
                         ('internet', 'absences'),
                         ('G2', 'G3')]
        >>> discretiser_param = {
                'absences': {'method':"fixed",
                             'numeric_split_points':[1, 10]
                            },
                 'G1': {'method':"fixed",
                        'numeric_split_points':[10]
                       },
                 'G2': {'method':"fixed",
                        'numeric_split_points':[10]
                       }
                }
        >>> discretiser_alg = {'absences': 'unsupervised',
                              'G1': 'unsupervised',
                              'G2': 'unsupervised'
                             }
        >>> bayesian_param = {'method':"BayesianEstimator", 'bayes_prior':"K2"}
        >>> clf = BayesianNetworkClassifier(edge_list, discretiser_alg, discretiser_param, bayesian_param)
        >>> clf.fit(X_train, y_train)
        >>> clf.predict(X_test)
        array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
               1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
               1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0])

    """
    def __init__(
        self,
        list_of_edges: List[Tuple[str, str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify an algorithm to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability model
            return_prob (bool): choose to return predictions or probability
            - if True: return a pandas dataframe with the predicted probability for each state
            - if False: return a 1-D prediction array

        Raises:
            KeyError: If an incorrect argument is passed
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """

        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        self._target_name = None
        self._discretise_data = None

    @staticmethod
    def _validate_discretiser(discretiser_alg, discretiser_kwargs):
        unavailable_discretiser_algs = {
            k: v not in ["unsupervised", "tree", "mdlp"]
            for k, v in discretiser_alg.items()
        }

        if any(unavailable_discretiser_algs.values()):
            algs = {
                k: discretiser_alg[k]
                for k, v in unavailable_discretiser_algs.items() if v
            }
            raise KeyError(
                f"Some discretiser algorithms are not supported: `{algs}`. "
                "Please choose in ['unsupervised', 'tree', 'mdlp']")

        if set(discretiser_kwargs) != set(discretiser_alg):
            raise ValueError(
                "discretiser_alg and discretiser_kwargs should have the same keys"
            )

    def _discretise_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Helper method to discretise input data using parameters in
        `discretiser_kwargs` and `discretiser_alg`.
        The splitting thresholds are extracted from the training data

        Args:
            X (pd.DataFrame): a dataframe to be discretised

        Returns:
            a discretised version of the input dataframe
        """

        X = X.copy()

        for col in self.discretiser_alg.keys():

            if self.discretiser_alg[col] == "unsupervised":

                if self.discretiser_kwargs[col]["method"] == "fixed":
                    X[col] = Discretiser(
                        **self.discretiser_kwargs[col]).transform(
                            X[col].values)
                else:
                    discretiser = Discretiser(
                        **self.discretiser_kwargs[col]).fit(
                            self._discretise_data[col].values)
                    X[col] = discretiser.transform(X[col].values)

            else:
                if self.discretiser_alg[col] == "tree":
                    discretiser = DecisionTreeSupervisedDiscretiserMethod(
                        mode="single",
                        tree_params=self.discretiser_kwargs[col])

                elif self.discretiser_alg[col] == "mdlp":
                    discretiser = MDLPSupervisedDiscretiserMethod(
                        self.discretiser_kwargs[col])

                discretiser.fit(
                    dataframe=self._discretise_data,
                    feat_names=[col],
                    target=self._target_name,
                    target_continuous=False,
                )

                X[col] = discretiser.transform(X[[col]])

        return X

    def fit(self, X: pd.DataFrame,
            y: pd.Series) -> "BayesianNetworkClassifier":
        """
        Build a Bayesian Network classifier from a set of training data.
        The method first discretises the feature using parameters in `discretiser_kwargs`
        and `discretiser_alg`. Next, it learns all the possible nodes that each feature
        can have. Finally, it learns the CPDs of the Bayesian Network.

        Args:
            X (pd.DataFrame): input training data
            y (pd.Series): categorical label for each row of X

        Returns:
            self
        """
        self._discretise_data = X.copy()
        self._discretise_data[y.name] = y
        self._target_name = y.name
        X = self._discretise_features(X)

        X[y.name] = y
        self.bn = self.bn.fit_node_states(X)
        self.bn = self.bn.fit_cpds(X, **self.probability_kwargs)

        return self

    def predict(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]:
        """
        Return predictions for the input data

        Args:
            X (pd.DataFrame): A dataframe of shape (num_row, num_features) for model to predict

        Returns:
            Model's prediction: a numpy array of shape (num_row,), or a dataframe
            of predicted probabilities for each state if return_prob is True

        Raises:
            ValueError: if CPDs are empty

        """
        if self.bn.cpds == {}:
            raise ValueError("No CPDs found. The model has not been fitted")

        X = self._discretise_features(X)

        if self.return_prob:
            pred = self.bn.predict_probability(X, self._target_name)
        else:
            pred = self.bn.predict(X, self._target_name).to_numpy().reshape(-1)

        return pred
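For a runnable end-to-end illustration of `fit` and `predict`, here is a minimal sketch on hypothetical toy data (the feature values and edge list are invented for illustration); the inputs are already discrete, so no discretiser is configured:

    import pandas as pd

    from causalnex.network.sklearn import BayesianNetworkClassifier

    # hypothetical toy data: two discrete features and a binary label
    X = pd.DataFrame({
        "rush_hour": [1, 0, 0, 1, 1, 0, 1, 0],
        "weather": [2, 0, 1, 0, 2, 1, 0, 2],
    })
    y = pd.Series([1, 0, 0, 1, 1, 1, 1, 0], name="traffic")

    edges = [("rush_hour", "traffic"), ("weather", "traffic")]
    clf = BayesianNetworkClassifier(edges)
    clf.fit(X, y)
    print(clf.predict(X))  # 1-D numpy array of predicted labels

    # with return_prob=True, predict() returns a dataframe of
    # per-state probabilities instead of a flat array
    clf_prob = BayesianNetworkClassifier(edges, return_prob=True).fit(X, y)
    print(clf_prob.predict(X))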
# This snippet assumes `sm` (a learned StructureModel), `discretised_data`,
# and its `train`/`test` splits from earlier steps.
from causalnex.evaluation import classification_report, roc_auc
from causalnex.inference import InferenceEngine
from causalnex.network import BayesianNetwork

# Declare the Bayesian Network model
bn = BayesianNetwork(sm)
bn = bn.fit_node_states(discretised_data)

# Fit the conditional probability distributions (CPDs)
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

# Inspect the target
print(bn.cpds["G1"])  # exam grade G1 - Pass/Fail

# Inspect the inputs for row 18, excluding the target
print(discretised_data.loc[18, discretised_data.columns != 'G1'])


# Predict
predictions = bn.predict(discretised_data, "G1")
print('The prediction is \'{prediction}\''.format(prediction=predictions.loc[18, 'G1_prediction']))
print('The ground truth is \'{truth}\''.format(truth=discretised_data.loc[18, 'G1']))

# Evaluate
classification_report(bn, test, "G1")

roc, auc = roc_auc(bn, test, "G1")
print(auc)
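# Optionally plot the ROC curve; this sketch assumes `roc` is a sequence of
# (fpr, tpr) pairs, as returned by causalnex's roc_auc
import matplotlib.pyplot as plt

fpr, tpr = zip(*roc)
plt.plot(fpr, tpr, marker=".")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.title("ROC curve (AUC = {:.2f})".format(auc))
plt.show()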


# Marginal probability baseline (same fitting as above)
bn = bn.fit_cpds(discretised_data, method="BayesianEstimator", bayes_prior="K2")

# Compute the marginal likelihood of all states for every node
ie = InferenceEngine(bn)
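From here, a typical next step is to pull the marginals out of the engine; a short sketch, assuming the fitted `bn` above:

    # Query the marginal distribution of every node, given no observations
    marginals = ie.query()

    # Marginal probability of each state of the target (G1: Fail/Pass)
    print(marginals["G1"])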