Example #1
0
    def initialize_tabular_cpd(self, cid: BayesianModel) -> bool:
        """Initialize the probability table for the inherited TabularCPD

        Returns True if successful, False otherwise
        """
        if not self.parents_instantiated(cid):
            return False
        poss_values = self.possible_values(cid)
        if self.force_state_names:
            state_names_list = self.force_state_names
            if not set(poss_values).issubset(state_names_list):
                raise Exception(
                    "variable {} can take value outside given state_names".
                    format(self.variable))
        else:
            state_names_list = poss_values

        card = len(state_names_list)
        evidence = cid.get_parents(self.variable)
        evidence_card = [cid.get_cardinality(p) for p in evidence]
        matrix = np.array([[
            int(self.f(*i) == t)
            for i in itertools.product(*self.parent_values(cid))
        ] for t in state_names_list])
        state_names = {self.variable: state_names_list}

        super().__init__(self.variable,
                         card,
                         matrix,
                         evidence,
                         evidence_card,
                         state_names=state_names)
        return True
def create_network(max_parents, max_children, connections):
    
    '''
    Takes in max parents / chilldren per node constraints and network definition (connections). 
    Outputs model and list of nodes included in network.
    '''
    
    model = BayesianModel([connections[0]])
    nodes_added = []
    
    for edge in connections[1:]:
            #whichever has more children 
            #Node A
            try:
                if edge[0] in nodes_added and edge[1] in nodes_added: #both already in network
                    if len(model.get_children(edge[0])) < max_children and len(model.get_children(edge[1])) < max_children:
                        if len(model.get_parents(edge[0])) < max_parents and len(model.get_parents(edge[1])) < max_parents:
                            add_first = random.choice([0,1])
                            model.add_edge(edge[add_first],edge[1-add_first])

                        if len(model.get_parents(edge[0])) < max_parents and len(model.get_parents(edge[1])) >= max_parents:
                            model.add_edge(edge[1],edge[0])

                        if len(model.get_parents(edge[0])) >= max_parents and len(model.get_parents(edge[1])) < max_parents:
                            model.add_edge(edge[0],edge[1])

                    if len(model.get_children(edge[0])) < max_children and len(model.get_children(edge[1])) >= max_children and len(model.get_parents(edge[1])) < max_parents:
                        model.add_edge(edge[0],edge[1])

                    if len(model.get_children(edge[0])) >= max_children and len(model.get_children(edge[1])) < max_children and len(model.get_parents(edge[0])) < max_parents:
                        model.add_edge(edge[1],edge[0])

                elif edge[0] in nodes_added and edge[1] not in nodes_added: #Node A in network Node B not in network
                    if len(model.get_children(edge[0])) < max_children:
                        model.add_edge(edge[0],edge[1])
                        nodes_added.append(edge[1])

                elif edge[0] not in nodes_added and edge[1] in nodes_added: #Node A not in network Node B in network
                    if len(model.get_children(edge[1])) < max_children:
                        model.add_edge(edge[1],edge[0])
                        nodes_added.append(edge[0])

                else:
                    #neither in network, choose randomly
                    add_first = random.choice([0,1])
                    model.add_edge(edge[add_first],edge[1-add_first])
                    nodes_added.append(edge[add_first])
                    nodes_added.append(edge[1-add_first])

            except ValueError: #catch non-DAG error
                pass

    #if node is leaf, connect to target
    for node in nodes_added:
        if node in model.get_leaves():
            model.add_edge(node,'target')
            
    return model, nodes_added
Example #3
0
 def initialize_tabular_cpd(self, cid: BayesianModel) -> bool:
     """initialize the TabularCPD with a matrix representing a uniform random distribution"""
     parents = cid.get_parents(self.variable)
     # check that parents are initialized
     for parent in parents:
         if not cid.get_cpds(parent):
             return False
     parents_card = [cid.get_cardinality(p) for p in parents]
     transition_matrix = np.ones(
         (self.variable_card,
          np.product(parents_card).astype(int))) / self.variable_card
     super().__init__(self.variable,
                      self.variable_card,
                      transition_matrix,
                      parents,
                      parents_card,
                      state_names=self.state_names)
     return True
def causal_query(target: str, evidence: Dict, actions: Dict,
                 model: BayesianModel):
    queries = {}
    #Pearl Adjustment formula
    for action in actions.keys():
        z = set(model.get_parents(action))

        inference_engine = VariableElimination(model)
        base_query = inference_engine.query(variables=[target, action],
                                            evidence={**evidence},
                                            show_progress=False)
        if len(z) != 0:
            adjustment_query = inference_engine.query(variables=z,
                                                      show_progress=False)
            query = base_query.product(adjustment_query, inplace=False)
        else:
            query = base_query

        query.normalize()
        query.marginalize(z)
        queries.update({action: query})
    return queries
Example #5
0
class BayesianNetwork:
    """
    Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes represent variables,
    edges represent the causal relationships between variables.

    ``BayesianNetwork`` stores nodes with their possible states, edges and
    conditional probability distributions (CPDs) of each node.

    ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an extension of ``networkx.DiGraph``
    (see :func:`causalnex.structure.structuremodel.StructureModel`).

    In order to define the ``BayesianNetwork``, users should provide a relevant ``StructureModel``.
    Once ``BayesianNetwork`` is initialised, no changes to the ``StructureModel`` can be made
    and CPDs can be learned from the data.

    The learned CPDs can be then used for likelihood estimation and predictions.

    Example:
    ::
        >>> # Create a Bayesian Network with a manually defined DAG.
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel``
        >>> bn.nodes
        ['rush_hour', 'traffic', 'weather']
        >>>
        >>> bn.edges
        [('rush_hour', 'traffic'), ('weather', 'traffic')]
        >>> # A ``BayesianNetwork`` doesn't store any CPDs yet
        >>> bn.cpds
        >>> {}
        >>>
        >>> # Learn the nodes' states from the data
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states(data)
        >>> bn.node_states
        {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}}
        >>> # Learn the CPDs from the data
        >>> bn = bn.fit_cpds(data)
        >>> # Use the learned CPDs to make predictions on the unseen data
        >>> test_data = pd.DataFrame({
        >>>                           'rush_hour': [False, False, True, True],
        >>>                           'weather': ['Good', 'Bad', 'Good', 'Bad']
        >>>                           })
        >>> bn.predict(test_data, "traffic").to_dict()
        >>> {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        >>> bn.predict_probability(test_data, "traffic").to_dict()
        {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333},
         'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}}
    """
    def __init__(self, structure: StructureModel):
        """
        Create a ``BayesianNetwork`` with a DAG defined by ``StructureModel``.

        Args:
            structure: a graph representing a causal relationship between variables.
                       In the structure
                           - cycles are not allowed;
                           - multiple (parallel) edges are not allowed;
                           - isolated nodes and multiple components are not allowed.

        Raises:
            ValueError: If the structure is not a connected DAG.
        """
        n_components = nx.number_weakly_connected_components(structure)

        if n_components > 1:
            raise ValueError(
                "The given structure has {n_components} separated graph components. "
                "Please make sure it has only one.".format(
                    n_components=n_components))

        if not nx.is_directed_acyclic_graph(structure):
            cycle = nx.find_cycle(structure)
            raise ValueError(
                "The given structure is not acyclic. Please review the following cycle: {cycle}"
                .format(cycle=cycle))

        # _node_states is a Dict in the form `dict: {node: dict: {state: index}}`.
        # Underlying libraries expect all states to be integers from zero, and
        # thus this dict is used to convert from state -> idx, and then back from idx -> state as required
        self._node_states = None  # type: Dict[str: Dict[Hashable, int]]
        self._structure = structure

        # _model is a pgmpy Bayesian Model.
        # It is used for:
        #                - probability fitting
        #                - predictions
        self._model = BayesianModel()
        self._model.add_edges_from(structure.edges)

    @property
    def structure(self) -> StructureModel:
        """
        ``StructureModel`` defining the DAG of the Bayesian Network.

        Returns:
            A ``StructureModel`` of the Bayesian Network.
        """
        return self._structure

    @property
    def nodes(self) -> List[str]:
        """
        List of all nodes contained within the Bayesian Network.

        Returns:
            A list of node names.
        """
        return list(self._model.nodes)

    @property
    def node_states(self) -> Dict[str, Set[Hashable]]:
        """
        Dictionary of all states that each node can take.

        Returns:
            A dictionary of node and its possible states, in format of `dict: {node: state}`.
        """
        return {
            node: set(states.keys())
            for node, states in self._node_states.items()
        }

    @node_states.setter
    def node_states(self, nodes: Dict[str, Set[Hashable]]):
        """
        Set the list of nodes that are contained within the Bayesian Network.
        The states of all nodes must be provided.

        Args:
            nodes: A dictionary of node and its possible states, in format of `dict: {node: state}`.

        Raises:
            ValueError: if a node contains a None state.
            KeyError: if a node is missing.
        """
        missing_feature = set(self.nodes).difference(set(nodes.keys()))
        if missing_feature:
            raise KeyError(
                "The data does not cover all the features found in the Bayesian Network. "
                "Please check the following features: {nodes}".format(
                    nodes=missing_feature))

        for node, states in nodes.items():
            if any(pd.isnull(list(states))):
                raise ValueError(
                    "node '{node}' contains None state".format(node=node))
        self._node_states = {
            n: {v: k
                for k, v in enumerate(sorted(nodes[n]))}
            for n in nodes
        }

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """
        List of all edges contained within the Bayesian Network, as a Tuple(from_node, to_node).

        Returns:
            A list of all edges.
        """
        return list(self._model.edges)

    @property
    def cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Conditional Probability Distributions of each node within the Bayesian Network.

        The row-index of each dataframe is all possible states for the node.
        The col-index of each dataframe is a MultiIndex that describes all possible permutations of parent states.

        For example, for a node :math:`P(A | B, D)`, where
        .. math::
            - A \\in \\text{{"a", "b", "c", "d"}}
            - B \\in \\text{{"x", "y", "z"}}
            - C \\in \\text{{False, True}}

        >>> b         x                   y               z
        >>> d     False     True      False True      False     True
        >>> a
        >>> a  0.265306  0.214286  0.066667  0.25  0.444444  0.000000
        >>> b  0.183673  0.214286  0.200000  0.25  0.222222  0.666667
        >>> c  0.285714  0.285714  0.400000  0.25  0.333333  0.333333
        >>> d  0.265306  0.285714  0.333333  0.25  0.000000  0.000000

        Returns:
            Conditional Probability Distributions of each node within the Bayesian Network.
        """
        cpds = dict()
        for cpd in self._model.cpds:

            iterables = [
                sorted(self._node_states[var].keys())
                for var in cpd.variables[1:]
            ]
            cols = [""]
            if iterables:
                cols = pd.MultiIndex.from_product(iterables,
                                                  names=cpd.variables[1:])

            cpds[cpd.variable] = pd.DataFrame(
                cpd.values.reshape(len(self._node_states[cpd.variable]),
                                   max(1, len(cols))))
            cpds[cpd.variable][cpd.variable] = sorted(
                self._node_states[cpd.variable].keys())
            cpds[cpd.variable].set_index([cpd.variable], inplace=True)
            cpds[cpd.variable].columns = cols

        return cpds

    def fit_node_states(self, df: pd.DataFrame) -> "BayesianNetwork":
        """
        Fit all states of nodes that can appear in the data.
        The dataframe provided should contain every possible state (values that can be taken) for every column.

        Args:
            df: data to fit node states from. Each column indicates a node and each row
                an observed combination of states.

        Returns:
            self

        Raises:
            ValueError: if dataframe contains any missing data.
        """
        self.node_states = {c: set(df[c].unique()) for c in df.columns}

        return self

    def _state_to_index(self,
                        df: pd.DataFrame,
                        nodes: List[str] = None) -> pd.DataFrame:
        """
        Transforms all values in df to an integer, as defined by the mapping from fit_node_states.

        Args:
            df: data to transform
            nodes: list of nodes to map to index. None means all.

        Returns:
            The transformed dataframe.

        Raises:
            ValueError: if nodes have not been fit, or if column names do not match node names.
        """

        df.is_copy = False
        cols = nodes if nodes else df.columns
        for col in cols:
            df[col] = df[col].map(self._node_states[col])
        df.is_copy = True
        return df

    def fit_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: str = None,
        equivalent_sample_size: int = None,
    ) -> "BayesianNetwork":
        """
        Learn conditional probability distributions for all nodes in the Bayesian Network, conditioned on
        their incoming edges (parents).

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using Dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivelant_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self

        Raises:
            ValueError: if an invalid method or bayes_prior is specified.

        """

        state_names = {
            k: list(v.values())
            for k, v in self._node_states.items()
        }

        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        transformed_data = self._state_to_index(transformed_data[self.nodes])

        if method == "MaximumLikelihoodEstimator":
            self._model.fit(
                data=transformed_data,
                estimator=MaximumLikelihoodEstimator,
                state_names=state_names,
            )

        elif method == "BayesianEstimator":
            valid_bayes_priors = ["BDeu", "K2"]
            if bayes_prior not in valid_bayes_priors:
                raise ValueError(
                    "unrecognised bayes_prior, please use on of %s" %
                    " ".join(valid_bayes_priors))

            self._model.fit(
                data=transformed_data,
                estimator=BayesianEstimator,
                prior_type=bayes_prior,
                equivalent_sample_size=equivalent_sample_size,
                state_names=state_names,
            )
        else:
            valid_methods = ["MaximumLikelihoodEstimator", "BayesianEstimator"]
            raise ValueError("unrecognised method, please use on of %s" %
                             " ".join(valid_methods))

        return self

    def fit_node_states_and_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: str = None,
        equivalent_sample_size: int = None,
    ) -> "BayesianNetwork":
        """
        Call `fit_node_states` and then `fit_cpds`.

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivelant_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self
        """

        return self.fit_node_states(data).fit_cpds(data, method, bayes_prior,
                                                   equivalent_sample_size)

    def predict(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the state of a node based on some input data, using the Bayesian Network.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """

        if all(parent in data.columns
               for parent in self._model.get_parents(node)):
            return self._predict_from_complete_data(data, node)

        return self._predict_from_incomplete_data(data, node)

    def _predict_from_complete_data(self, data: pd.DataFrame,
                                    node: str) -> pd.DataFrame:
        """
        Predicts state of node given all parents of node exist within data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        transformed_data["{node}_prediction".format(
            node=node)] = transformed_data.apply(
                lambda row: cpd[tuple(row[parent]
                                      for parent in parents)].idxmax()
                if parents else cpd[""].idxmax(),
                axis=1,
            )
        return transformed_data[[node + "_prediction"]]

    def _predict_from_incomplete_data(self, data: pd.DataFrame,
                                      node: str) -> pd.DataFrame:
        """
        Predicts state of node when some parents of node do not exist within data.
        This method uses the pgmpy predict function, which predicts the most likely state for every node
        that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """

        transformed_data = deepcopy(data)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # transformed_data.is_copy()

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])

        predictions = self._model.predict(transformed_data)[[node]]

        return predictions.rename(columns={node: node + "_prediction"})

    def predict_probability(self, data: pd.DataFrame,
                            node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """

        if all(parent in data.columns
               for parent in self._model.get_parents(node)):
            return self._predict_probability_from_complete_data(data, node)

        return self._predict_probability_from_incomplete_data(data, node)

    def _predict_probability_from_complete_data(self, data: pd.DataFrame,
                                                node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        def lookup_probability(row, s):
            """Retrieve probability from CPD"""
            if parents:
                return cpd[tuple(row[parent] for parent in parents)].loc[s]
            return cpd.at[s, ""]

        for state in self.node_states[node]:
            transformed_data["{n}_{s}".format(
                n=node, s=state)] = transformed_data.apply(
                    lambda row, st=state: lookup_probability(row, st), axis=1)

        return transformed_data[[
            "{n}_{s}".format(n=node, s=state)
            for state in self.node_states[node]
        ]]

    def _predict_probability_from_incomplete_data(self, data: pd.DataFrame,
                                                  node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method uses the pgmpy predict_probability function, which predicts the probability
        of every state for every node that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])

        probability = self._model.predict_probability(
            transformed_data)  # type: pd.DataFrame

        # keep only probabilities for the node we are interested in
        cols = []
        pattern = re.compile("^{node}_[0-9]+$".format(node=node))
        # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962)
        for col in probability.columns:
            if pattern.match(col):
                cols.append(col)
        probability = probability[cols]
        probability.columns = cols

        return probability
Example #6
0
class Network_handler:
    '''
    Handles creation and usage of the probabilistic network over CERN's data.
    Can deal only with a SINGLE file-priority combination.
    Note that the methods of this class have numbers and must be called in order.
    '''
    def __init__(self, pnh, gh):
        '''
        Constructor
        '''
        extractor = pnh.get_data_extractor()
        self.best_model = BayesianModel()
        self.training_instances = ""
        self.device_considered = pnh.get_device()
        self.priority_considered = pnh.get_priority()
        self.markov = MarkovModel()
        self.general_handler = gh
        self.variables_names = extractor.get_variable_names()
        self.rankedDevices = extractor.get_ranked_devices()
        self.data = pnh.get_dataframe()
        self.file_writer = pnh.get_file_writer()
        self.file_suffix = pnh.get_file_suffix()

    def learn_structure(self, method, scoring_method, log=True):
        ''' (4)
        Method that builds the structure of the data
        -----------------
        Parameters:
        method          : The technique used to search for the structure
            -> scoring_approx     - To use an approximated search with scoring method
            -> scoring_exhaustive - To use an exhaustive search with scoring method
            -> constraint         - To use the constraint based technique
        scoring_method : K2, bic, bdeu
        log             - "True" if you want to print debug information in the console    
        '''

        #Select the scoring method for the local search of the structure
        if scoring_method == "K2":
            scores = K2Score(self.data)
        elif scoring_method == "bic":
            scores = BicScore(self.data)
        elif scoring_method == "bdeu":
            scores = BdeuScore(self.data)

        #Select the actual method
        if method == "scoring_approx":
            est = HillClimbSearch(self.data, scores)
        elif method == "scoring_exhaustive":
            est = ExhaustiveSearch(self.data, scores)
        elif method == "constraint":
            est = ConstraintBasedEstimator(self.data)

        self.best_model = est.estimate()
        self.eliminate_isolated_nodes(
        )  # REMOVE all nodes not connected to anything else

        for edge in self.best_model.edges_iter():
            self.file_writer.write_txt(str(edge))

        self.log("Method used for structural learning: " + method, log)
        #self.log("Training instances skipped: " + str(self.extractor.get_skipped_lines()), log)
        self.log("Search terminated", log)

    def estimate_parameters(self, log=True):
        ''' (5)
        Estimates the parameters of the found network
        '''
        estimator = BayesianEstimator(self.best_model, self.data)
        self.file_writer.write_txt("Number of nodes: " +
                                   str(len(self.variables_names)))
        self.file_writer.write_txt("Complete list: " +
                                   str(self.variables_names))

        for node in self.best_model.nodes():
            cpd = estimator.estimate_cpd(node, prior_type='K2')
            self.best_model.add_cpds(cpd)
            self.log(cpd, log)
            self.file_writer.write_txt(cpd.__str__())

    def inference(self, variables, evidence, mode="auto", log=True):
        ''' (6)
        Computes the inference over some variables of the network (given some evidence)
        '''

        inference = VariableElimination(self.best_model)
        #inference = BeliefPropagation(self.markov)
        #inference = Mplp(self.best_model)
        header = "------------------- INFERENCE ------------------------"
        self.log(header, log)
        self.file_writer.write_txt(header, newline=True)
        self.file_writer.write_txt("(With parents all set to value 1)")

        if mode == "auto":
            self.log("          (with parents all set to value 1)", log)
            for node in self.best_model.nodes():
                variables = [node]
                parents = self.best_model.get_parents(node)
                evidence = dict()
                for p in parents:
                    evidence[p] = 1
                phi_query = inference.query(variables, evidence)
                for key in phi_query:
                    self.file_writer.write_txt(str(phi_query[key]))
                    self.log(phi_query[key], log)

        elif mode == "manual":
            phi_query = inference.query(variables, evidence)
            for key in phi_query:
                self.log(phi_query[key], log)
            '''
            map_query = inference.map_query(variables, evidence)
            print(map_query)
            '''

    def draw_network(self, label_choice, location_choice, location, log):
        ''' (7) 
        Draws the bayesian network.
        ----
        location_choice = True iff we want to show the location of devices in the graph.
        label_choice = "single" if we want to show single label, "double" for double label of arcs
        location = 0,1,2 depending by the location (H0, H1, H2)
        '''
        bn_graph = gv.Digraph(format="png")

        # Extract color based on the building
        if location_choice:

            devices = self.variables_names
            device_location = dict()
            device_locationH1 = dict()

            #For H0
            for d in devices:
                allDevicesLocations = self.general_handler.get_device_locations(
                )
                device_location[d] = allDevicesLocations[d][0]
                device_locationH1[d] = allDevicesLocations[d][1]  #temp for H1
            location_color = self.assign_color(device_location)
            location_colorH1 = self.assign_color(device_locationH1)
            '''
            # Logging and saving info
            self.log(device_location, log)
            self.log(location_color, log)
            self.file_writer.write_txt(device_location, newline = True)
            self.file_writer.write_txt(location_color, newline = True)
            '''

            # Creating the subgraphs, one for each location:
            loc_subgraphs = dict()
            for loc in location_color:
                name = "cluster_" + loc
                loc_subgraphs[loc] = gv.Digraph(name)
                loc_subgraphs[loc].graph_attr[
                    'label'] = loc  #Label with name to be visualized in the image

        # Create nodes
        for node in self.best_model.nodes():
            if location_choice:
                locationH0 = device_location[node]
                locationH1 = device_locationH1[node]
                loc_subgraphs[locationH0].node(
                    node,
                    style='filled',
                    fillcolor=location_colorH1[locationH1]
                )  #add the node to the right subgraph
                #loc_subgraphs[locationH0].node(node) #USE THIS TO ADD ONLY H0
            else:
                bn_graph.node(node)

        # Add all subgraphs in the final graph:
        if location_choice:
            for loc in loc_subgraphs:
                bn_graph.subgraph(loc_subgraphs[loc])

        # Create and color edges
        for edge in self.best_model.edges_iter():
            inference = VariableElimination(self.best_model)
            label = ""

            # Inference for first label and color of edges
            variables = [edge[1]]
            evidence = dict()
            evidence[edge[0]] = 1
            phi_query = inference.query(variables, evidence)
            value = phi_query[edge[1]].values[1]
            value = round(value, 2)

            if label_choice == "single":
                label = str(value)

            if label_choice == "double":
                # Inference for second label
                variables = [edge[0]]
                evidence = dict()
                evidence[edge[1]] = 1
                phi_query = inference.query(variables, evidence)
                value_inv = phi_query[edge[0]].values[1]
                value_inv = round(value_inv, 2)
                label = str(value) + "|" + str(value_inv)

            if value >= 0.75:
                bn_graph.edge(edge[0], edge[1], color="red", label=label)
            else:
                bn_graph.edge(edge[0], edge[1], color="black", label=label)

        # Save the .png graph
        if self.device_considered == "CUSTOM":
            imgPath = '../../output/CUSTOM' + self.file_suffix
        else:
            if location_choice:
                locat = "_H0H1"
            else:
                locat = ""
            imgPath = '../../output/' + self.device_considered + '_' + self.priority_considered + locat
        bn_graph.render(imgPath)
        os.remove(imgPath)  #remove the source code generated by graphviz

    def data_info(self, selection, log):
        ''' (9) Prints or logs some extra information about the data or the network
        '''
        # 1 - DEVICE FREQUENCY AND OCCURRENCES
        if 1 in selection:
            self.file_writer.write_txt(
                "Device ranking (max 20 devices are visualized)", newline=True)
            i = 1
            for dr in self.rankedDevices:
                self.file_writer.write_txt(dr[0] + "             \t" +
                                           str(dr[1]) + "\t" + str(dr[2]))
                i = i + 1
                if i == 20:
                    break

        # 2 - EDGES OF THE NETWORK
        if 2 in selection:
            self.file_writer.write_txt("Edges of the network:", newline=True)
            for edge in self.best_model.edges_iter():
                self.file_writer.write_txt(str(edge))

        # 3 - MARKOV NETWORK
        if 3 in selection:
            self.markov = self.best_model.to_markov_model(
            )  #create the markov model from the BN
            nice_graph = pydot.Dot(graph_type='graph')
            for node in self.markov.nodes():
                node_pydot = pydot.Node(node)
                nice_graph.add_node(node_pydot)
            for edge in self.markov.edges():
                edge_pydot = pydot.Edge(edge[0], edge[1], color="black")
                nice_graph.add_edge(edge_pydot)
            nice_graph.write_png('../../output/' + self.device_considered +
                                 '_' + self.priority_considered +
                                 '-markov.png')

            self.file_writer.write_txt("MARKOV NETWORK FACTORS:", newline=True)
            for factor in self.markov.factors:
                self.log("MARKOV---------------------------------------", log)
                self.log(factor, log)
                self.file_writer.write_txt(factor.__str__())

        # 4 - INFERENCE NETWORK
        if 4 in selection:
            nice_graph = pydot.Dot(graph_type='digraph')
            nodes = self.best_model.nodes()
            inference = VariableElimination(self.best_model)
            for node1 in nodes:
                pos = nodes.index(node1) + 1
                for i in range(pos, len(nodes)):
                    node2 = nodes[i]
                    variables = [node2]
                    evidence = dict()
                    evidence[node1] = 1
                    phi_query = inference.query(variables, evidence)
                    prob1 = phi_query[node2].values[
                        1]  #probability of direct activation (inference from node1=1 to node2)
                    variables = [node1]
                    evidence = dict()
                    evidence[node2] = 1
                    phi_query = inference.query(variables, evidence)
                    prob2 = phi_query[node1].values[
                        1]  #probability of inverse activation (inference from node2=1 to node1)
                    prob1 = round(prob1, 2)
                    prob2 = round(prob2, 2)
                    if prob1 >= 0.75 and (
                            prob1 - prob2
                    ) <= 0.40:  #add direct arc from node1 to node2
                        ls = [node1, node2]
                        self.fix_node_presence(ls, nice_graph)
                        double_label = str(prob1) + "|" + str(prob2)
                        nice_graph.add_edge(
                            pydot.Edge(node1,
                                       node2,
                                       color="red",
                                       label=double_label))
                    elif prob2 >= 0.75 and (prob2 - prob1) <= 0.40:
                        ls = [node1, node2]
                        self.fix_node_presence(ls, nice_graph)
                        double_label = str(prob2) + "|" + str(prob1)
                        nice_graph.add_edge(
                            pydot.Edge(node2,
                                       node1,
                                       color="red",
                                       label=double_label))
                    elif prob1 >= 0.75 and prob2 >= 0.75:
                        ls = [node1, node2]
                        self.fix_node_presence(ls, nice_graph)
                        if prob1 >= prob2:
                            double_label = str(prob1) + "|" + str(prob2)
                            nice_graph.add_edge(
                                pydot.Edge(node1,
                                           node2,
                                           color="orange",
                                           label=double_label))
                        else:
                            double_label = str(prob2) + "|" + str(prob1)
                            nice_graph.add_edge(
                                pydot.Edge(node2,
                                           node1,
                                           color="orange",
                                           label=double_label))
                    elif prob1 >= 0.55 and prob2 >= 0.55:
                        ls = [node1, node2]
                        self.fix_node_presence(ls, nice_graph)
                        if prob1 >= prob2:
                            double_label = str(prob1) + "|" + str(prob2)
                            nice_graph.add_edge(
                                pydot.Edge(node1,
                                           node2,
                                           color="black",
                                           label=double_label))
                        else:
                            double_label = str(prob2) + "|" + str(prob1)
                            nice_graph.add_edge(
                                pydot.Edge(node2,
                                           node1,
                                           color="black",
                                           label=double_label))

            if self.device_considered == "CUSTOM":
                imgPath = '../../output/CUSTOM' + self.file_suffix
                nice_graph.write_png(imgPath + "-inference_network.png")
            else:
                nice_graph.write_png('../../output/' + self.device_considered +
                                     '_' + self.priority_considered +
                                     '-inference_network.png')

    def fix_node_presence(self, nodes, pydot_graph):
        ''' Adds the list of nodes to the graph, if they are not already present '''
        for node in nodes:
            if node not in pydot_graph.get_nodes():
                pydot_graph.add_node(pydot.Node(node))

    def eliminate_isolated_nodes(self):
        '''
        If a node doesn't have any incoming or outgoing edge, it is eliminated from the graph
        '''
        for nodeX in self.best_model.nodes():
            tup = [item for item in self.best_model.edges() if nodeX in item]
            if not tup:
                self.file_writer.write_txt(
                    "Node " + str(nodeX) +
                    " has no edges: it has been eliminated.")
                self.best_model.remove_node(nodeX)
        if self.best_model.nodes() == []:
            raise DataError("No nodes left in this file-priority combination.")

    def assign_color(self, device_location):
        '''
        Returns a dictionary with the location as key and the assigned colour as value (WORKS WITH MAX 10 DIFFERENT LOCATIONS)
        '''
        system_color = [
            'Blue', 'Green', 'Red', 'Purple', 'Yellow', 'Red', 'Grey',
            'Light Red', 'Light Blue', 'Light Green'
        ]
        location_color = dict()  # key = location; value = color
        for dev, loc in device_location.items():
            if loc not in location_color:
                color = system_color[0]
                system_color.remove(color)
                location_color[loc] = color
        return location_color

    def log(self, text, log):
        ''' Prints the text in the console, if the "log" condition is True. '''
        if log:
            print(text)
Example #7
0
class BayesianNetwork:
    """
    Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes represent variables,
    edges represent the causal relationships between variables.

    ``BayesianNetwork`` stores nodes with their possible states, edges and
    conditional probability distributions (CPDs) of each node.

    ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an extension of ``networkx.DiGraph``
    (see :func:`causalnex.structure.structuremodel.StructureModel`).

    In order to define the ``BayesianNetwork``, users should provide a relevant ``StructureModel``.
    Once ``BayesianNetwork`` is initialised, no changes to the ``StructureModel`` can be made
    and CPDs can be learned from the data.

    The learned CPDs can be then used for likelihood estimation and predictions.

    Example:
    ::
        >>> # Create a Bayesian Network with a manually defined DAG.
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel``
        >>> bn.nodes
        ['rush_hour', 'traffic', 'weather']
        >>>
        >>> bn.edges
        [('rush_hour', 'traffic'), ('weather', 'traffic')]
        >>> # A ``BayesianNetwork`` doesn't store any CPDs yet
        >>> bn.cpds
        >>> {}
        >>>
        >>> # Learn the nodes' states from the data
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states(data)
        >>> bn.node_states
        {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}}
        >>> # Learn the CPDs from the data
        >>> bn = bn.fit_cpds(data)
        >>> # Use the learned CPDs to make predictions on the unseen data
        >>> test_data = pd.DataFrame({
        >>>                           'rush_hour': [False, False, True, True],
        >>>                           'weather': ['Good', 'Bad', 'Good', 'Bad']
        >>>                           })
        >>> bn.predict(test_data, "traffic").to_dict()
        >>> {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        >>> bn.predict_probability(test_data, "traffic").to_dict()
        {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}}
        {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333},
         'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}}
    """

    def __init__(self, structure: StructureModel):
        """
        Create a ``BayesianNetwork`` with a DAG defined by ``StructureModel``.

        Args:
            structure: a graph representing a causal relationship between variables.
                       In the structure
                           - cycles are not allowed;
                           - multiple (parallel) edges are not allowed;
                           - isolated nodes and multiple components are not allowed.

        Raises:
            ValueError: If the structure is not a connected DAG.
        """
        n_components = nx.number_weakly_connected_components(structure)

        if n_components > 1:
            raise ValueError(
                f"The given structure has {n_components} separated graph components. "
                "Please make sure it has only one."
            )

        if not nx.is_directed_acyclic_graph(structure):
            cycle = nx.find_cycle(structure)
            raise ValueError(
                f"The given structure is not acyclic. Please review the following cycle: {cycle}"
            )

        # _node_states is a Dict in the form `dict: {node: dict: {state: index}}`.
        # Underlying libraries expect all states to be integers from zero, and
        # thus this dict is used to convert from state -> idx, and then back from idx -> state as required
        self._node_states = {}  # type: Dict[str: Dict[Hashable, int]]
        self._structure = structure

        # _model is a pgmpy Bayesian Model.
        # It is used for:
        #                - probability fitting
        #                - predictions
        self._model = BayesianModel()
        self._model.add_edges_from(structure.edges)

    @property
    def structure(self) -> StructureModel:
        """
        ``StructureModel`` defining the DAG of the Bayesian Network.

        Returns:
            A ``StructureModel`` of the Bayesian Network.
        """
        return self._structure

    @property
    def nodes(self) -> List[str]:
        """
        List of all nodes contained within the Bayesian Network.

        Returns:
            A list of node names.
        """
        return list(self._model.nodes)

    @property
    def node_states(self) -> Dict[str, Set[Hashable]]:
        """
        Dictionary of all states that each node can take.

        Returns:
            A dictionary of node and its possible states, in format of `dict: {node: state}`.
        """
        return {node: set(states.keys()) for node, states in self._node_states.items()}

    @node_states.setter
    def node_states(self, nodes: Dict[str, Set[Hashable]]):
        """
        Set the list of nodes that are contained within the Bayesian Network.
        The states of all nodes must be provided.

        Args:
            nodes: A dictionary of node and its possible states, in format of `dict: {node: state}`.

        Raises:
            ValueError: if a node contains a None state.
            KeyError: if a node is missing.
        """
        missing_feature = set(self.nodes).difference(set(nodes.keys()))

        if missing_feature:
            raise KeyError(
                "The data does not cover all the features found in the Bayesian Network. "
                f"Please check the following features: {missing_feature}"
            )

        self._node_states = {}

        for node, states in nodes.items():
            if any(pd.isnull(list(states))):
                raise ValueError(f"node '{node}' contains None state")

            self._node_states[node] = {v: k for k, v in enumerate(sorted(states))}

    @property
    def edges(self) -> List[Tuple[str, str]]:
        """
        List of all edges contained within the Bayesian Network, as a Tuple(from_node, to_node).

        Returns:
            A list of all edges.
        """
        return list(self._model.edges)

    @property
    def cpds(self) -> Dict[str, pd.DataFrame]:
        """
        Conditional Probability Distributions of each node within the Bayesian Network.

        The row-index of each dataframe is all possible states for the node.
        The col-index of each dataframe is a MultiIndex that describes all possible permutations of parent states.

        For example, for a node :math:`P(A | B, D)`, where
        .. math::
            - A \\in \\text{{"a", "b", "c", "d"}}
            - B \\in \\text{{"x", "y", "z"}}
            - C \\in \\text{{False, True}}

        >>> b         x                   y               z
        >>> d     False     True      False True      False     True
        >>> a
        >>> a  0.265306  0.214286  0.066667  0.25  0.444444  0.000000
        >>> b  0.183673  0.214286  0.200000  0.25  0.222222  0.666667
        >>> c  0.285714  0.285714  0.400000  0.25  0.333333  0.333333
        >>> d  0.265306  0.285714  0.333333  0.25  0.000000  0.000000

        Returns:
            Conditional Probability Distributions of each node within the Bayesian Network.
        """
        cpds = {}

        for cpd in self._model.cpds:
            names = cpd.variables[1:]
            cols = [""]

            if names:
                cols = pd.MultiIndex.from_product(
                    [sorted(self._node_states[var].keys()) for var in names],
                    names=names,
                )

            cpds[cpd.variable] = pd.DataFrame(
                cpd.values.reshape(
                    len(self._node_states[cpd.variable]), max(1, len(cols))
                )
            )
            cpds[cpd.variable][cpd.variable] = sorted(
                self._node_states[cpd.variable].keys()
            )
            cpds[cpd.variable].set_index([cpd.variable], inplace=True)
            cpds[cpd.variable].columns = cols

        return cpds

    def set_cpd(self, node: str, df: pd.DataFrame) -> "BayesianNetwork":
        """
        Provide self-defined CPD to Bayesian Network

        Args:
            node: the node to add self-defined cpd.
            df: self-defined cpd in pandas DataFrame format.

        Returns:
            self

        Raises:
            IndexError: if the index names of the pandas DataFrame does not match the expected DataFrame.
            ValueError: if node does not exist in Bayesian Network or a bad cpd table is provided.
        """
        if node not in self.nodes:
            raise ValueError(f'Non-existing node "{node}"')

        # Check Table
        true_parents = {
            parent_node: self.node_states[parent_node]
            for parent_node in self._structure.predecessors(node)
        }
        table_parents = {
            name: set(df.columns.levels[i].values)
            for i, name in enumerate(df.columns.names)
        }
        if not (
            set(df.index.values) == self.node_states[node]
            and true_parents == table_parents
            and df.index.name == node
        ):
            raise IndexError("Wrong index values. Please check your indices")

        sorted_df = df.reindex(sorted(df.columns), axis=1)
        node_card = len(self.node_states[node])
        evidence, evidence_card = zip(
            *[(key, len(table_parents[key])) for key in sorted(table_parents.keys())]
        )
        tabular_cpd = TabularCPD(
            node,
            node_card,
            sorted_df.values,
            evidence=evidence,
            evidence_card=evidence_card,
        )
        model_copy = copy.deepcopy(self._model)
        model_copy.add_cpds(tabular_cpd)
        model_copy.check_model()

        self._model = model_copy
        return self

    def fit_node_states(self, df: pd.DataFrame) -> "BayesianNetwork":
        """
        Fit all states of nodes that can appear in the data.
        The dataframe provided should contain every possible state (values that can be taken) for every column.

        Args:
            df: data to fit node states from. Each column indicates a node and each row
                an observed combination of states.

        Returns:
            self

        Raises:
            ValueError: if dataframe contains any missing data.
        """
        self.node_states = {c: set(df[c].unique()) for c in df.columns}
        return self

    def _state_to_index(
        self,
        df: pd.DataFrame,
        nodes: Optional[List[str]] = None,
    ) -> pd.DataFrame:
        """
        Transforms all values in df to an integer, as defined by the mapping from fit_node_states.

        Args:
            df: data to transform
            nodes: list of nodes to map to index. None means all.

        Returns:
            The transformed dataframe.

        Raises:
            ValueError: if nodes have not been fit, or if column names do not match node names.
        """
        df.is_copy = False
        cols = nodes if nodes else df.columns

        for col in cols:
            df[col] = df[col].map(self._node_states[col])

        df.is_copy = True
        return df

    def fit_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: Optional[str] = None,
        equivalent_sample_size: Optional[int] = None,
    ) -> "BayesianNetwork":
        """
        Learn conditional probability distributions for all nodes in the Bayesian Network, conditioned on
        their incoming edges (parents).

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using Dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivelant_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self

        Raises:
            ValueError: if an invalid method or bayes_prior is specified.
        """
        state_names = {k: list(v.values()) for k, v in self._node_states.items()}

        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        transformed_data = self._state_to_index(transformed_data[self.nodes])

        if method == "MaximumLikelihoodEstimator":
            self._model.fit(
                data=transformed_data,
                estimator=MaximumLikelihoodEstimator,
                state_names=state_names,
            )

        elif method == "BayesianEstimator":
            valid_bayes_priors = ["BDeu", "K2"]
            if bayes_prior not in valid_bayes_priors:
                raise ValueError(
                    f"unrecognised bayes_prior, please use one of {valid_bayes_priors}"
                )

            self._model.fit(
                data=transformed_data,
                estimator=BayesianEstimator,
                prior_type=bayes_prior,
                equivalent_sample_size=equivalent_sample_size,
                state_names=state_names,
            )
        else:
            valid_methods = ["MaximumLikelihoodEstimator", "BayesianEstimator"]
            raise ValueError(f"unrecognised method, please use one of {valid_methods}")

        return self

    def fit_node_states_and_cpds(
        self,
        data: pd.DataFrame,
        method: str = "MaximumLikelihoodEstimator",
        bayes_prior: Optional[str] = None,
        equivalent_sample_size: Optional[int] = None,
    ) -> "BayesianNetwork":
        """
        Call `fit_node_states` and then `fit_cpds`.

        Args:
            data: dataframe containing one column per node in the Bayesian Network.
            method: how to fit probabilities. One of:
                    - "MaximumLikelihoodEstimator": fit probabilities using Maximum Likelihood Estimation;
                    - "BayesianEstimator": fit probabilities using Bayesian Parameter Estimation. Use bayes_prior.
            bayes_prior: how to construct the Bayesian prior used by method="BayesianEstimator". One of:
                         - "K2": shorthand for dirichlet where all pseudo_counts are 1
                                 regardless of variable cardinality;
                         - "BDeu": equivalent of using dirichlet and using uniform 'pseudo_counts' of
                                   `equivalent_sample_size / (node_cardinality * np.prod(parents_cardinalities))`
                                   for each node. Use equivelant_sample_size.
            equivalent_sample_size: used by BDeu bayes_prior to compute pseudo_counts.

        Returns:
            self
        """
        return self.fit_node_states(data).fit_cpds(
            data, method, bayes_prior, equivalent_sample_size
        )

    def add_node(
        self,
        node: str,
        edges_to_add: List[Tuple[str, str]],
        edges_to_remove: List[Tuple[str, str]],
    ) -> "BayesianNetwork":
        """
        Adding a latent variable to the structure model, as well as its corresponding edges

        Args:
            node: Name of the node
            edges_to_add: which edges to add to the structure
            edges_to_remove: which edges to remove from the structure

        Returns:
            self

        Raises:
            ValueError: If lv_name exists in the network or
                if `edges_to_add` include edges NOT containing the latent variable or
                if `edges_to_remove` include edges containing the latent variable
        """
        if any(node not in edges for edges in edges_to_add):
            raise ValueError(f"Should only add edges containing node '{node}'")
        if any(node in edges for edges in edges_to_remove):
            raise ValueError(f"Should only remove edges NOT containing node '{node}'")

        self._structure.add_edges_from(edges_to_add)
        self._structure.remove_edges_from(edges_to_remove)
        self._model.add_edges_from(edges_to_add)
        self._model.remove_edges_from(edges_to_remove)

        return self

    def fit_latent_cpds(  # pylint: disable=too-many-arguments
        self,
        lv_name: str,
        lv_states: List,
        data: pd.DataFrame,
        box_constraints: Optional[Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]] = None,
        priors: Optional[Dict[str, pd.DataFrame]] = None,
        initial_params: Union[str, Dict[str, pd.DataFrame]] = "random",
        non_missing_data_factor: int = 1,
        n_runs: int = 20,
        stopping_delta: float = 0.0,
    ) -> "BayesianNetwork":
        """
        This runs the EM algorithm to estimate the CPDs of latent variables and their corresponding Markov blanket

        Args:
            lv_name: Latent variable name
            lv_states: the states the LV can assume
            data: dataframe, must contain all variables in the Markov Blanket of the latent variable. Include one column
                with the latent variable name, filled with np.nan for missing info about LV.
                If some data is present about the LV, create complete columns.
            n_runs: max number of EM alternations
            stopping_delta: if max difference in current - last iteration CPDS < stopping_delta => convergence reached
            initial_params: way to initialise parameters. Can be:
                - "random": random values (default)
                - "avg": uniform distributions everywhere. Not advised, as it may be the a stationary point on itself
                - if provide a dictionary of dataframes, this will be used as the initialisation
            box_constraints: minimum and maximum values for each model parameter. Specified with a dictionary mapping:
                - Node
                - two dataframes, in order: Min(P(Node|Par(Node))) and Max(P(Node|Par(Node)))
            priors: priors, provided as a mapping Node -> dataframe with Dirichilet priors for P(Node|Par(Node))
            non_missing_data_factor:
                This is a weight added to the non-missing data samples. The effect is as if the amount of data provided
                was bigger. Empirically, helps to set the factor to 10 if the non missing data is ~1% of the dataset

        Returns:
            self

        Raises:
            ValueError: if the latent variable is not a string or
                if the latent variable cannot be found in the network or
                if the latent variable is present/observed in the data
                if the latent variable states are empty
        """
        if not isinstance(lv_name, str):
            raise ValueError(f"Invalid latent variable name '{lv_name}'")
        if lv_name not in self._structure:
            raise ValueError(f"Latent variable '{lv_name}' not added to the network")
        if not isinstance(lv_states, list) or len(lv_states) == 0:
            raise ValueError(f"Latent variable '{lv_name}' contains no states")

        # Register states for the latent variable
        self._node_states[lv_name] = {v: k for k, v in enumerate(sorted(lv_states))}

        # Run EM algorithm
        estimator = EMSingleLatentVariable(
            sm=self.structure,
            data=data,
            lv_name=lv_name,
            node_states={n: sorted(s) for n, s in self.node_states.items()},
            initial_params=initial_params,
            box_constraints=box_constraints,
            priors=priors,
            non_missing_data_factor=non_missing_data_factor,
        )
        estimator.run(n_runs=n_runs, stopping_delta=stopping_delta)

        # Add CPDs into the model
        tab_cpds = [pd_to_tabular_cpd(el) for el in estimator.cpds.values()]
        self._model.add_cpds(*tab_cpds)

        return self

    def predict(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the state of a node based on some input data, using the Bayesian Network.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """
        if all(parent in data.columns for parent in self._model.get_parents(node)):
            return self._predict_from_complete_data(data, node)

        return self._predict_from_incomplete_data(data, node)

    def _predict_from_complete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict state of node given all parents of node exist within data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column named {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        transformed_data[f"{node}_prediction"] = transformed_data.apply(
            lambda row: cpd[tuple(row[parent] for parent in parents)].idxmax()
            if parents
            else cpd[""].idxmax(),
            axis=1,
        )
        return transformed_data[[node + "_prediction"]]

    def _predict_from_incomplete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict state of node when some parents of node do not exist within data.
        This method uses the pgmpy predict function, which predicts the most likely state for every node
        that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict.

        Returns:
            A dataframe of predictions, containing a single column name {node}_prediction.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])
        predictions = self._model.predict(transformed_data)[[node]]

        return predictions.rename(columns={node: node + "_prediction"})

    def predict_probability(self, data: pd.DataFrame, node: str) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        if all(parent in data.columns for parent in self._model.get_parents(node)):
            return self._predict_probability_from_complete_data(data, node)

        return self._predict_probability_from_incomplete_data(data, node)

    def _predict_probability_from_complete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method inspects the CPD of node directly, since all parent states are known.
        This avoids traversing the full network to compute marginals.
        This method is fast.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame

        parents = sorted(self._model.get_parents(node))
        cpd = self.cpds[node]

        def lookup_probability(row, s):
            """Retrieve probability from CPD"""
            if parents:
                return cpd[tuple(row[parent] for parent in parents)].loc[s]
            return cpd.at[s, ""]

        for state in self.node_states[node]:
            transformed_data[f"{node}_{state}"] = transformed_data.apply(
                lambda row, st=state: lookup_probability(row, st), axis=1
            )

        return transformed_data[[f"{node}_{state}" for state in self.node_states[node]]]

    def _predict_probability_from_incomplete_data(
        self,
        data: pd.DataFrame,
        node: str,
    ) -> pd.DataFrame:
        """
        Predict the probability of each possible state of a node, based on some input data.
        This method uses the pgmpy predict_probability function, which predicts the probability
        of every state for every node that is not contained within data.
        With incomplete data, pgmpy goes beyond parents in the network to determine the most likely predictions.
        This method is slow.

        Args:
            data: data to make prediction.
            node: the node to predict probabilities.

        Returns:
            A dataframe of predicted probabilities, contained one column per possible state, named {node}_{state}.
        """
        transformed_data = data.copy(deep=True)  # type: pd.DataFrame
        self._state_to_index(transformed_data)

        # pgmpy will predict all missing data, so drop column we want to predict
        transformed_data = transformed_data.drop(columns=[node])

        probability = self._model.predict_probability(
            transformed_data
        )  # type: pd.DataFrame

        # keep only probabilities for the node we are interested in
        cols = []
        pattern = re.compile(f"^{node}_[0-9]+$")

        # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962)
        for col in probability.columns:
            if pattern.match(col):
                cols.append(col)

        probability = probability[cols]
        probability.columns = cols
        return probability
assert qualityModel.check_model()

# %% codecell

# TODO how to get joint of C and N? P(C, N)? Need to marginalize somehow over their combined conditional variable L?
print(qualityModel.get_cpds('Cost'))
print(qualityModel.get_cpds('Number'))

# WAY 1: eliminating then mutliplying the marginalizations
elimQ = VariableElimination(qualityModel)
factorCost = elimQ.query(['Cost'])
factorNumber = elimQ.query(['Number'])

res = reduce(mul, [factorCost, factorNumber])
sum(sum(res.values))
print(res)

# WAY 2: condition on the same conditioning node and then do combinations of the other variables
qualityModel.get_parents(node='Cost')
qualityModel.get_parents(node='Number')

res2 = (reduce(mul, [
    qualityModel.get_cpds('Cost').to_factor(),
    qualityModel.get_cpds('Number').to_factor()
]).normalize(inplace=False))

print(
    res2.marginalize(variables=['Quality', 'Location'],
                     inplace=False).normalize(inplace=False))

# Answer: see the answer in the commented section of NetworkUtils file
Example #9
0
def bayesian_net():
    alarm_model = BayesianModel([
        ('Burglary',
         'Alarm'),  # Alarm has two parents, thus, it is twice as son.
        ('Earthquake', 'Alarm'),
        ('Alarm', 'JohnCalls'),
        ('Alarm', 'MaryCalls')
    ])
    for i in alarm_model.get_parents('Alarm'):
        print(i)

    # variable_card indicates the number of posible values this variable can take.

    cpd_burglary = TabularCPD(
        variable='Burglary',
        variable_card=2,  # 0->True, 1->False
        values=[
            [0.001],  # true probabilities of the table
            [0.999]
        ])  # false probabilities of the table
    cpd_earthquake = TabularCPD(
        variable='Earthquake',
        variable_card=2,  # 0->True 1->False
        values=[
            [0.002],  # true probabilities of the table
            [0.998]
        ])  # false probabilities of the table

    # evidence_card indicates the number of possible values the parents of the variable can take

    cpd_alarm = TabularCPD(
        variable='Alarm',
        variable_card=2,  # 0->True 1->False
        values=[
            [0.95, 0.94, 0.29, 0.001],  # true probabilities of the table
            [0.05, 0.06, 0.71, 0.999]
        ],  # false probabilities of the table
        evidence=['Burglary', 'Earthquake'],
        evidence_card=[2, 2])
    cpd_john_calls = TabularCPD(
        variable='JohnCalls',
        variable_card=2,  # 0->True 1->False
        values=[[0.95, 0.05], [0.05, 0.95]],
        evidence=['Alarm'],
        evidence_card=[2])

    cpd_mary_calls = TabularCPD(
        variable='MaryCalls',
        variable_card=2,  # 0->True 1->False
        values=[
            [0.7, 0.1],  # true probabilities of the table
            [0.3, 0.9]
        ],  # false probabilities of the table
        evidence=['Alarm'],
        evidence_card=[2])
    for i in [
            cpd_burglary, cpd_earthquake, cpd_alarm, cpd_john_calls,
            cpd_mary_calls
    ]:
        print(i)

    alarm_model.add_cpds(cpd_burglary, cpd_earthquake, cpd_alarm,
                         cpd_john_calls, cpd_mary_calls)
    alarm_model.check_model()

    infer = VariableElimination(alarm_model)

    # Uncomment to obtain the result before normalization
    # infer = SimpleInference(alarm_model)

    print(
        infer.query(
            ['JohnCalls'],
            evidence={
                'Burglary': 1,
                'Earthquake': 1,
                'Alarm': 0,
                'MaryCalls': 0
            },
        )['JohnCalls'])

    print(
        infer.query(['Burglary'], evidence={
            'JohnCalls': 0,
            'MaryCalls': 0
        })['Burglary'])

    # Variable order can be specified if necessary

    print(
        infer.query(['Burglary'],
                    evidence={
                        'JohnCalls': 0,
                        'MaryCalls': 0
                    },
                    elimination_order=['Alarm', 'Earthquake'])['Burglary'])

    sampling = BayesianModelSampling(alarm_model)
    data = sampling.rejection_sample(evidence={},
                                     size=20,
                                     return_type="dataframe")
    print(data)

    data = sampling.rejection_sample(evidence=[('JohnCalls', 0),
                                               ('MaryCalls', 0)],
                                     size=20,
                                     return_type='dataframe')
    print(data)

    sampling = BayesianModelSampling(alarm_model)
    data = sampling.rejection_sample(evidence=None,
                                     size=5000,
                                     return_type="dataframe")
    approx_alarm_model = BayesianModel([('Burglary', 'Alarm'),
                                        ('Earthquake', 'Alarm'),
                                        ('Alarm', 'JohnCalls'),
                                        ('Alarm', 'MaryCalls')])
    approx_alarm_model.fit(data, estimator=BayesianEstimator)
    approx_alarm_model.check_model()

    for cpd in approx_alarm_model.get_cpds():
        print("CPD of {variable}:".format(variable=cpd.variable))
        print(cpd)

    infer = VariableElimination(approx_alarm_model)

    print(
        infer.query(
            ['JohnCalls'],
            evidence={
                'Burglary': 1,
                'Earthquake': 1,
                'Alarm': 0,
                'MaryCalls': 0
            },
        )['JohnCalls'])

    print(
        infer.query(['Burglary'], evidence={
            'JohnCalls': 0,
            'MaryCalls': 0
        })['Burglary'])

    print(
        alarm_model.predict_probability(
            data[['Burglary', 'Earthquake', 'Alarm', 'JohnCalls']]))
Example #10
0
path_node.transition = None
noisy_path_node.transition = None

# CREATING BAYESIAN MODEL (GRAPH)
cusumano_model = BayesianModel([(start_node, path_node),
                                (goal_node, path_node),
                                (path_node, noisy_path_node)])

# ATTENTION!!! =)
# Test functions. If these do not work you probably need to check python requirements.txt:
# this was tested using networkx 2.3 and pgmpy 0.1.7 from dev branch in github.
print("Root nodes: %s" % cusumano_model.get_roots())
print("Children of 'start': %s" % cusumano_model.get_children(start_node))
print("Leaf nodes: %s" % cusumano_model.get_leaves())
print("Parents of 'noisy_path': %s" %
      cusumano_model.get_parents(noisy_path_node))
# FIXME: not working: we get error 'RandomChoice' object is not iterable'.
# print(cusumano_model.get_independencies())

nx.draw_networkx(cusumano_model,
                 arrowsize=15,
                 node_size=800,
                 node_color='#90b9f9')
plt.show()


# Defines whether to accept or reject the new sample
def acceptance(x, x_new):
    if x_new > x:
        return True
    else: