def estimate(self):
    """
    Pick the best-scoring `DAG` out of every candidate structure.

    Each graph produced by `self.all_dags()` is rated with
    `self.scoring_method.score` and the highest-rated one wins. Only the
    structure is estimated; no parameters are fitted.

    Returns
    -------
    model: `DAG` instance
        A new `DAG` carrying the nodes and edges of the winning candidate.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.estimators import ExhaustiveSearch
    >>> # random sample with 3 variables, where B and C are identical:
    >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    >>> data['C'] = data['B']
    >>> est = ExhaustiveSearch(data)
    >>> best_model = est.estimate()
    >>> best_model.edges()
    [('B', 'C')]
    """
    score_of = self.scoring_method.score
    winner = max(self.all_dags(), key=score_of)

    # The winner is copied into a fresh DAG, inserting nodes and edges in
    # sorted order (presumably so the result's ordering is reproducible).
    ordered_nodes = sorted(winner.nodes())
    ordered_edges = sorted(winner.edges())

    result = DAG()
    result.add_nodes_from(ordered_nodes)
    result.add_edges_from(ordered_edges)
    return result
def estimate(self, tabu_length=100, max_indegree=2, black_list=None,
             epsilon=1e-4, max_iter=1e6, show_progress=True):
    """
    Greedy hill-climb search for a `DAG` structure, scored with `K2Score`.

    Starts from a graph that contains `self.variables` as nodes and no
    edges. On every iteration the single operation with the largest score
    change reported by `self._legal_operations` is applied; the search
    stops when no operation is available, when the best change is below
    `epsilon`, or after `max_iter` iterations.

    Parameters
    ----------
    tabu_length: int
        Maximum length of the tabu list. After an edge is added or removed,
        the reversing operation is recorded; after a flip, the flip
        operation itself is recorded. Oldest entries are dropped first.
    max_indegree: int
        Forwarded unchanged to `self._legal_operations`.
    black_list: iterable or None
        Edges that must not appear in the graph (per the original author's
        note). Converted to a set and forwarded to
        `self._legal_operations`; `None` means an empty set.
    epsilon: float
        The search stops as soon as the best score change is below this.
    max_iter: int or float
        Upper bound on the number of iterations (converted with `int`).
    show_progress: bool
        When true the iterations are driven by `trange` instead of `range`.

    Returns
    -------
    model: `DAG` instance
        The graph reached when the search stopped.
    """
    # K2 local score: rates one node together with its parents. It is
    # evaluated on every iteration for all candidate changes.
    score_fn = K2Score(data=self.data).local_score

    # Start from an edgeless graph over all variables.
    current_model = DAG()
    current_model.add_nodes_from(self.variables)

    black_list = set() if black_list is None else set(black_list)

    # Bounded memory of recently blocked operations.
    tabu_list = deque(maxlen=tabu_length)

    if show_progress:
        iteration = trange(int(max_iter))
    else:
        iteration = range(int(max_iter))

    for _ in iteration:
        # `default` keeps max() from raising ValueError when there is no
        # legal operation left (e.g. a single variable, or every move is
        # tabu / black-listed).
        best_operation, best_score_change = max(
            self._legal_operations(
                model=current_model,
                score=score_fn,
                tabu_list=tabu_list,
                max_indegree=max_indegree,
                black_list=black_list,
            ),
            key=lambda t: t[1],
            default=(None, None),
        )

        if best_operation is None or best_score_change < epsilon:
            break
        elif best_operation[0] == '+':
            current_model.add_edge(*best_operation[1])
            tabu_list.append(("-", best_operation[1]))
        elif best_operation[0] == '-':
            current_model.remove_edge(*best_operation[1])
            tabu_list.append(("+", best_operation[1]))
        elif best_operation[0] == 'flip':
            X, Y = best_operation[1]
            current_model.remove_edge(X, Y)
            current_model.add_edge(Y, X)
            tabu_list.append(best_operation)

    return current_model
def random_dag(number_of_nodes: int = 5, edge_density: float = 0.4, max_in_degree: int = 4) -> DAG:
    """Create a connected, random directed acyclic graph (DAG).

    Nodes are named ``X0 .. X{n-1}``. Edges created while connecting the
    graph always point from the lower-numbered node to the higher-numbered
    one.

    Parameters
    ----------
    number_of_nodes: int
        Number of nodes; zero (or a negative value) yields an empty graph.
    edge_density: float
        Fraction of the ``n * (n - 1) / 2`` possible edges to reach; edges
        are added with `add_random_edge` until that count is met.
    max_in_degree: int
        In-degree limit checked before an edge is pointed at an already
        placed node during the connecting phase.

    Returns
    -------
    DAG
        The generated graph.

    Raises
    ------
    ValueError
        If the requested number of edges exceeds what a DAG on
        `number_of_nodes` nodes can hold, or if more than one node is
        requested with ``max_in_degree < 1`` (no connected graph can
        satisfy that, and the connecting loop could otherwise never end).
    """
    node_names = [f"X{i}" for i in range(number_of_nodes)]
    node_count = len(node_names)

    maximum_number_of_edges = node_count * (node_count - 1) / 2
    target_edges = int(edge_density * maximum_number_of_edges)
    if target_edges > maximum_number_of_edges:
        raise ValueError(
            "edge_density is too high: the requested number of edges "
            "exceeds the maximum possible for a DAG of this size"
        )
    if node_count > 1 and max_in_degree < 1:
        raise ValueError(
            "max_in_degree must be at least 1 to connect more than one node"
        )

    dag = DAG()
    if not node_names:
        # random.choice() would raise IndexError on an empty list.
        return dag

    # Position lookup, replacing repeated O(n) list.index() calls.
    position = {name: i for i, name in enumerate(node_names)}

    # First make sure the dag is connected
    visited = []
    unvisited = list(node_names)
    node = random.choice(unvisited)
    unvisited.remove(node)
    visited.append(node)
    dag.add_node(node)
    while unvisited:
        node = random.choice(unvisited)
        neighbor = random.choice(visited)
        if position[node] < position[neighbor]:
            if dag.in_degree(neighbor) >= max_in_degree:
                # neighbor is saturated; draw another pair
                continue
            dag.add_edge(node, neighbor)
        else:
            # node is new to the graph, so this is its first incoming edge
            dag.add_edge(neighbor, node)
        unvisited.remove(node)
        visited.append(node)

    # Then add edges until desired density is reached
    # NOTE(review): add_random_edge is not given max_in_degree, so it is
    # unclear whether the limit is still honoured in this phase - confirm.
    while dag.number_of_edges() < target_edges:
        add_random_edge(dag, node_names)
    return dag
def pdag2dag(self, edge_dict):
    """
    Orient a parent-set description of a PDAG into a DAG.

    Parameters
    ----------
    edge_dict: dict
        Maps each node to an iterable of its parents. The result is keyed
        by ``range(len(edge_dict))``.

    Returns
    -------
    dict
        Maps every index in ``range(len(edge_dict))`` to the set of its
        parents in the DAG returned by
        `ConstraintBasedEstimator.pdag_to_dag`.
    """
    # Flatten the mapping into (parent, child) edges.
    parent_child_pairs = []
    for child, parents in edge_dict.items():
        for parent in parents:
            parent_child_pairs.append((parent, child))

    pattern = DAG(parent_child_pairs)
    oriented_edges = ConstraintBasedEstimator.pdag_to_dag(pattern).edges()

    parent_sets = {index: set() for index in range(len(edge_dict))}
    for parent, child in oriented_edges:
        parent_sets[child].add(parent)
    return parent_sets
def test_markov_blanet(self):
    """The Markov blanket of "y" in a ten-edge graph is {s, w, x, u, z, v}."""
    edges = [
        ("x", "y"),
        ("z", "y"),
        ("y", "w"),
        ("y", "v"),
        ("u", "w"),
        ("s", "v"),
        ("w", "t"),
        ("w", "m"),
        ("v", "n"),
        ("v", "q"),
    ]
    G = DAG(edges)

    blanket = set(G.get_markov_blanket("y"))
    expected = {"s", "w", "x", "u", "z", "v"}
    self.assertEqual(blanket, expected)
def estimate(
    self, start=None, tabu_length=0, max_indegree=None, epsilon=1e-4, max_iter=1e6
):
    """
    Performs local hill climb search to estimate the `DAG` structure
    that has optimal score, according to the scoring method supplied in the
    constructor. Starts at model `start` and proceeds by step-by-step network
    modifications until a local maximum is reached. Only estimates network
    structure, no parametrization.

    Parameters
    ----------
    start: DAG instance
        The starting point for the local search. By default a completely
        disconnected network is used. A supplied `start` is modified in
        place and is the object that gets returned.
    tabu_length: int
        If provided, the last `tabu_length` graph modifications cannot be
        reversed during the search procedure. This serves to enforce a
        wider exploration of the search space. Default value: 0 (the tabu
        list always stays empty).
    max_indegree: int or None
        If provided and unequal None, the procedure only searches among
        models where all nodes have at most `max_indegree` parents.
        Defaults to None.
    epsilon: float (default: 1e-4)
        Defines the exit condition. If the improvement in score is less
        than `epsilon`, the learned model is returned.
    max_iter: int (default: 1e6)
        The maximum number of iterations allowed. The learned model is
        returned once `max_iter` iterations have been performed.

    Returns
    -------
    model: `DAG` instance
        A `DAG` at a (local) score maximum.

    Raises
    ------
    ValueError
        If `start` is neither None nor a `DAG` over exactly the variables
        of the data set.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.estimators import HillClimbSearch, BicScore
    >>> # create data sample with 9 random variables:
    ... data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 9)), columns=list('ABCDEFGHI'))
    >>> # add 10th dependent variable
    ... data['J'] = data['A'] * data['B']
    >>> est = HillClimbSearch(data, scoring_method=BicScore(data))
    >>> best_model = est.estimate()
    >>> sorted(best_model.nodes())
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    >>> best_model.edges()
    [('B', 'J'), ('A', 'J')]
    >>> # search a model with restriction on the number of parents:
    >>> est.estimate(max_indegree=1).edges()
    [('J', 'A'), ('B', 'J')]
    """
    nodes = self.state_names.keys()
    if start is None:
        start = DAG()
        start.add_nodes_from(nodes)
    elif not isinstance(start, DAG) or not set(start.nodes()) == set(nodes):
        raise ValueError(
            "'start' should be a DAG with the same variables as the data set, or 'None'."
        )

    tabu_list = []
    current_model = start

    iter_no = 0
    # Strict comparison: counting from 0 with `<=` would perform
    # max_iter + 1 iterations.
    while iter_no < max_iter:
        iter_no += 1

        # Only operations with a strictly positive score change qualify;
        # the first one wins ties.
        best_score_delta = 0
        best_operation = None
        for operation, score_delta in self._legal_operations(
            current_model, tabu_list, max_indegree
        ):
            if score_delta > best_score_delta:
                best_operation = operation
                best_score_delta = score_delta

        if best_operation is None or best_score_delta < epsilon:
            break
        elif best_operation[0] == "+":
            current_model.add_edge(*best_operation[1])
            # Newest entry goes first; the slice drops the oldest ones.
            tabu_list = ([("-", best_operation[1])] + tabu_list)[:tabu_length]
        elif best_operation[0] == "-":
            current_model.remove_edge(*best_operation[1])
            tabu_list = ([("+", best_operation[1])] + tabu_list)[:tabu_length]
        elif best_operation[0] == "flip":
            X, Y = best_operation[1]
            current_model.remove_edge(X, Y)
            current_model.add_edge(Y, X)
            tabu_list = ([best_operation] + tabu_list)[:tabu_length]

    return current_model
def setUp(self):
    """Build a `BayesianModel` and a `DAG` from the same four edges."""
    edges = (("d", "g"), ("i", "g"), ("g", "l"), ("i", "s"))
    # Each constructor receives its own list, as in separate literals.
    self.G = BayesianModel(list(edges))
    self.G2 = DAG(list(edges))
def setUp(self):
    """Create `self.graph` with the edges X -> A, A -> Y and A -> B."""
    edges = [("X", "A"), ("A", "Y"), ("A", "B")]
    self.graph = DAG()
    self.graph.add_edges_from(edges)
def test_class_init_with_data_string(self):
    """A DAG built from string edge tuples exposes those nodes and edges."""
    self.graph = DAG([("a", "b"), ("b", "c")])

    node_names = sorted(self.graph.nodes())
    self.assertListEqual(node_names, ["a", "b", "c"])

    edge_pairs = hf.recursive_sorted(self.graph.edges())
    self.assertListEqual(edge_pairs, [["a", "b"], ["b", "c"]])
def setUp(self):
    """Create `self.graph` with the edges diff -> grade and intel -> grade."""
    edges = [("diff", "grade"), ("intel", "grade")]
    self.graph = DAG()
    self.graph.add_edges_from(edges)
def test_update_node_parents_bm_constructor(self):
    """Edges passed to the constructor determine each node's predecessors."""
    self.graph = DAG([("a", "b"), ("b", "c")])

    expected_parents = (("a", []), ("b", ["a"]), ("c", ["b"]))
    for node, parents in expected_parents:
        self.assertListEqual(list(self.graph.predecessors(node)), parents)
# Fixture hook (presumably unittest's setUp): stores a new DAG, constructed
# with no arguments, on self.graph.
def setUp(self): self.graph = DAG()
def pdag_to_dag(pdag):
    """Complete a PDAG to a DAG without adding v-structures, if possible.

    Works on a copy of `pdag`. Repeatedly looks for a node that has no
    directed outgoing edge and whose undirected neighbors are connected to
    all of its other predecessors, points every remaining edge of that node
    at it, and removes the node from the copy. If no such node is left
    while nodes remain, a warning is issued and the leftover edges are
    oriented arbitrarily. This is a static method.

    Parameters
    ----------
    pdag: DAG
        A directed acyclic graph pattern, consisting of (acyclic) directed
        edges as well as "undirected" edges, represented as both-way edges
        between nodes.

    Returns
    -------
    dag: DAG
        A faithful orientation of pdag, if one exists. Otherwise some fully
        oriented DAG with the structure of pdag.

    References
    ----------
    [1] Chickering, Learning Equivalence Classes of Bayesian-Network
        Structures, 2002; see page 454 (last paragraph) for the algorithm
        pdag_to_dag
        http://www.jmlr.org/papers/volume2/chickering02a/chickering02a.pdf
    [2] Dor & Tarsi, A simple algorithm to construct a consistent extension
        of a partially oriented graph, 1992,
        http://ftp.cs.ucla.edu/pub/stat_ser/r185-dor-tarsi.pdf

    Examples
    --------
    >>> from pgmpy.base import DAG
    >>> from pgmpy.estimators import ConstraintBasedEstimator
    >>> pdag1 = DAG([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'), ('D', 'A'), ('A', 'D')])
    >>> ConstraintBasedEstimator.pdag_to_dag(pdag1).edges()
    [('D', 'C'), ('C', 'B'), ('A', 'B'), ('A', 'D')]
    >>> # example of a pdag with no faithful extension:
    ... pdag2 = DAG([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])
    >>> ConstraintBasedEstimator.pdag_to_dag(pdag2).edges()
    UserWarning: PDAG has no faithful extension (= no oriented DAG with the same v-structures as PDAG). Remaining undirected PDAG edges oriented arbitrarily.
    [('B', 'C'), ('A', 'B'), ('A', 'C')]
    """
    remaining = pdag.copy()
    oriented = DAG()
    oriented.add_nodes_from(remaining.nodes())

    # Edges without a reverse counterpart are already directed: keep them.
    for tail, head in remaining.edges():
        if remaining.has_edge(head, tail):
            continue
        oriented.add_edge(tail, head)

    while remaining.number_of_nodes() > 0:
        for node in remaining.nodes():
            children = set(remaining.successors(node))
            parents = set(remaining.predecessors(node))
            has_directed_out = bool(children - parents)
            undirected = children & parents
            forms_clique = all(
                remaining.has_edge(neighbor, other)
                for other in remaining.predecessors(node)
                for neighbor in undirected
                if not neighbor == other
            )

            # Skip nodes that still point at something, or whose undirected
            # neighbors are not linked to all of its other predecessors.
            if has_directed_out or (undirected and not forms_clique):
                continue

            # Point every remaining edge of this node at it, then drop it.
            for parent in remaining.predecessors(node):
                oriented.add_edge(parent, node)
            remaining.remove_node(node)
            break
        else:
            # No node qualified: there is no faithful extension.
            warn(
                "PDAG has no faithful extension (= no oriented DAG with the "
                "same v-structures as PDAG). Remaining undirected PDAG edges "
                "oriented arbitrarily."
            )
            for tail, head in remaining.edges():
                if oriented.has_edge(head, tail):
                    continue
                try:
                    oriented.add_edge(tail, head)
                except ValueError:
                    # Best effort: an edge the DAG rejects is left out
                    # (presumably one that would close a cycle).
                    pass
            break

    return oriented