def Hybrid(dataset: pd.DataFrame):
    from pgmpy.estimators import MmhcEstimator
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    mmhc = MmhcEstimator(dataset)
    # The mmpc method takes a parameter significance_level (default=0.01): the desired
    # Type 1 error probability of falsely rejecting the null hypothesis that two
    # variables are independent, i.e., it bounds the Type 1 error rate.
    # (Therefore, the lower the value, the fewer dependencies we accept,
    # resulting in a sparser graph.)
    skeleton = mmhc.mmpc()
    print("Part 1) Skeleton: ", skeleton.edges())

    # Use hill climb search to orient the edges:
    hc = HillClimbSearch(dataset, scoring_method=BDeuScore(dataset, equivalent_sample_size=5))

    # Record the evaluation (BDeu score) reached after different iteration budgets
    bdeu = BDeuScore(dataset, equivalent_sample_size=5)
    iter_list = [2**i for i in range(20)]
    eval_list = []
    for iteration in iter_list:
        DAG_connection = hc.estimate(tabu_length=10,
                                     white_list=skeleton.to_directed().edges(),
                                     max_iter=iteration)
        model = BayesianModel(DAG_connection.edges())
        print(bdeu.score(model))
        eval_list.append(bdeu.score(model))

    print("Part 2) Model: ", model.edges())
    return model.edges(), [iter_list, eval_list]
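# A minimal usage sketch for Hybrid() (an illustration, not from the original code):
# it assumes a small synthetic discrete dataset; the column names and sample size
# below are arbitrary.
import numpy as np
import pandas as pd

toy_data = pd.DataFrame(np.random.randint(0, 3, size=(1000, 4)), columns=list("ABCD"))
edges, (iterations, scores) = Hybrid(toy_data)
print(edges)   # learned edge list
print(scores)  # BDeu score reached at each iteration budget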
def Hill_Climbing(dataset: pd.DataFrame):
    # from pgmpy.estimators import ExhaustiveSearch
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    bdeu = BDeuScore(dataset, equivalent_sample_size=5)
    hc = HillClimbSearch(dataset, scoring_method=BDeuScore(dataset, equivalent_sample_size=5))

    iter_list = [2**i for i in range(20)]
    eval_list = []
    for iteration in iter_list:
        DAG_connection = hc.estimate(tabu_length=10, max_iter=iteration)
        model = BayesianModel(DAG_connection.edges())
        print(bdeu.score(model))
        eval_list.append(bdeu.score(model))

    return model.edges(), [iter_list, eval_list]
from pgmpy.estimators import BDeuScore, K2Score, BicScore


def _SetScoringType(df, scoretype, verbose=3):
    if verbose >= 3:
        print('[bnlearn] >Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BDeuScore(df, equivalent_sample_size=5)
    else:
        # Previously an unknown scoretype fell through and raised a NameError
        # on return; fail early with a clear message instead.
        raise ValueError('[bnlearn] >Unknown scoretype [%s]' % (scoretype))

    return scoring_method
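# A small usage sketch (illustrative, not from the original): build a toy discrete
# DataFrame and pick one of the three supported score types above.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randint(0, 2, size=(100, 3)), columns=['A', 'B', 'C'])
scorer = _SetScoringType(df, 'bdeu')  # returns BDeuScore(df, equivalent_sample_size=5)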
def estimate(self, scoring_method=None, tabu_length=10, significance_level=0.01):
    if scoring_method is None:
        scoring_method = BDeuScore(self.data, equivalent_sample_size=10)

    # Phase 1: learn the undirected skeleton with MMPC
    skel = self.mmpc(significance_level)

    # Phase 2: orient the edges with hill climbing, restricted to the skeleton
    hc = HillClimbSearch(self.data, scoring_method=scoring_method)
    model = hc.estimate(white_list=skel.to_directed().edges(), tabu_length=tabu_length)
    return model
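# A sketch of a typical call, assuming this is the two-phase MmhcEstimator.estimate
# method as in pgmpy; the dataset below is synthetic and purely illustrative.
import numpy as np
import pandas as pd
from pgmpy.estimators import MmhcEstimator

data = pd.DataFrame(np.random.randint(0, 2, size=(500, 4)), columns=list('WXYZ'))
dag = MmhcEstimator(data).estimate(tabu_length=10, significance_level=0.01)
print(dag.edges())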
def __init__(self, fileName, alpha=1):
    """
    Build the empty graph model with n = data.shape[1] vertices.
    Set other attributes for use in model learning.

    Args:
        :param fileName (string): The path of the .csv file containing the data.
        :param alpha (int): The equivalent sample size of the Dirichlet uniform prior.
    """
    self.alpha = alpha
    self.data = np.genfromtxt(fileName, delimiter=',')
    self.num_vars = self.data.shape[1]
    self.data_frame = pd.DataFrame(self.data, columns=list(range(self.num_vars)))

    # Number of observed states for each variable
    self.var_states = []
    for index in range(len(self.data_frame.columns)):
        self.var_states.append(self.data_frame[index].nunique())

    self.bdeu = BDeuScore(self.data_frame, equivalent_sample_size=alpha)

    # Graph containers: undirected skeleton, maximum spanning tree, final DAG
    self.undirected = nx.Graph()
    self.undirected.add_nodes_from(range(self.num_vars))
    self.maxtree = self.undirected.copy()
    self.directed = BayesianModel()
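# Hypothetical usage sketch: the enclosing class name `GraphModel` is an assumption
# (only __init__ is shown above), and 'data.csv' stands in for a real
# comma-separated data file.
model = GraphModel('data.csv', alpha=1)
print(model.num_vars)    # number of columns in the CSV
print(model.var_states)  # observed state count per variable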
def forward_stepwise_selection(
    current_nodes: typing.Sequence[str],
    next_nodes: typing.Sequence[str],
    indegree: int,
    estimator: BDeuScore,
    verbose: bool = False,
) -> typing.Tuple[BayesianModel, float]:
    assert indegree <= len(current_nodes), \
        "Indegree should not be greater than the number of nodes in a slice"

    start_time = time.time()

    # Initialize model with continuity constraints
    continuity_constraints = list(zip(current_nodes, next_nodes))
    model = BayesianModel(continuity_constraints)

    # Create support dictionary to find node ICX_t from ICX_t+1
    node_map = dict([(n, c) for c, n in continuity_constraints])

    # Get number of available cores
    n_procs = psutil.cpu_count(logical=False)
    if verbose:
        print(f"Using {n_procs} cores")
        print("-" * 50)

    # If indegree is equal to the total number of nodes in the current slice,
    # speed up the search by simply constructing a dense network
    if indegree == len(current_nodes):
        if verbose:
            print("Building dense network...")
        edges = list(itertools.product(current_nodes, next_nodes))
        model.add_edges_from(edges)
    else:
        # Scan next nodes
        for x_n in next_nodes:
            if verbose:
                print(f"Finding parents of node {x_n}...")

            # Keep track of already added nodes to avoid repetitions.
            # Take continuity constraints into account in the added nodes.
            added_nodes = set()
            added_nodes.add(node_map[x_n])

            # Loop is repeated (indegree - 1) times since one parent node
            # has already been added due to continuity constraints
            for _ in range(indegree - 1):
                # Consider as testable only non-looping nodes
                testable_nodes = sorted(
                    set(current_nodes).difference(added_nodes))  # sort for reproducibility

                # Use multiprocessing to speed up the search
                with ProcessPoolExecutor(max_workers=n_procs) as executor:
                    # Fix estimator, model and x_n parameters and feed the
                    # partial function to the executor
                    edge_eval_par = functools.partial(
                        edge_eval, estimator=estimator, model=model, x_n=x_n)
                    scores = dict(
                        zip(testable_nodes,
                            executor.map(edge_eval_par, testable_nodes)))

                node_to_add = max(scores, key=scores.get)
                best_score = scores[node_to_add]
                model.add_edge(node_to_add, x_n)
                added_nodes.add(node_to_add)
                if verbose:
                    print(f"\t Added {node_to_add} with {best_score:.2f}")

    final_score = estimator.score(model)
    if verbose:
        print("-" * 50)
        print(f"Running time: {time.time() - start_time:.2f} s")

    return model, final_score
def edge_eval(x_c: str, x_n: str, model: BayesianModel, estimator: BDeuScore) -> float:
    # Evaluate the score of the model with candidate edge (x_c -> x_n) added
    copy = model.copy()
    copy.add_edge(x_c, x_n)
    return estimator.score(copy)
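# A usage sketch for forward_stepwise_selection (an illustration, not from the
# original): it assumes two-slice data where 'A_t', 'B_t' form the current slice
# and 'A_t+1', 'B_t+1' the next; the data is synthetic. With indegree equal to the
# slice size, the function takes the fast dense-network path.
import numpy as np
import pandas as pd
from pgmpy.estimators import BDeuScore

cols = ['A_t', 'B_t', 'A_t+1', 'B_t+1']
data = pd.DataFrame(np.random.randint(0, 2, size=(200, 4)), columns=cols)
estimator = BDeuScore(data, equivalent_sample_size=1)
model, score = forward_stepwise_selection(
    current_nodes=['A_t', 'B_t'], next_nodes=['A_t+1', 'B_t+1'],
    indegree=2, estimator=estimator, verbose=True)
print(model.edges(), score)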
def opt(self, file1, file2):
    # file1 holds the node list on its first line and the edge list on its second
    with open(file1, encoding="utf8") as f1:
        lines = f1.readlines()
    nodes = self.getegdes(lines[0])
    edges = self.getegdes(lines[1])
    data = pd.read_csv(file2)

    G = BayesianModel()
    G.add_nodes_from(nodes)
    for i in range(int(len(edges) / 2)):
        G.add_edge(edges[2 * i], edges[2 * i + 1])
    # nx.draw(G)
    # plt.show()

    # Score the current graph with three criteria
    k2 = K2Score(data).score(G)
    bic = BicScore(data).score(G)
    bdeu = BDeuScore(data).score(G)
    print(k2, ",", bic, ",", bdeu)

    est = HillClimbSearch(data, scoring_method=K2Score(data))
    model = est.estimate()
    model_edges = model.edges()

    G_ = nx.DiGraph()
    G_.add_edges_from(model_edges)
    G_copy = nx.DiGraph()
    G_copy.add_edges_from(G.edges)

    add = []
    add_mut = []
    delete = []
    delete_mut = []

    # Candidate edges to add: edges found by hill climbing that are missing
    # from G and would not create a cycle
    for edge in model_edges:
        node1 = edge[0]
        node2 = edge[1]
        if not nx.has_path(G, node2, node1):
            if not G.has_edge(node1, node2):
                this = (node1, node2)
                add.append(this)
                mut = mr.mutual_info_score(data[node1], data[node2])
                add_mut.append(mut)
    # Keep only the candidate with the highest mutual information
    seq = list(zip(add_mut, add))
    seq = sorted(seq, key=lambda s: s[0], reverse=True)
    # Alternative: keep candidates above a mutual-information threshold
    # alpha = 0.015
    # if seq[0][0] > alpha:
    #     add = seq[0:1]
    add = seq[0:1]

    # Candidate edges to delete: existing edges, ranked by lowest mutual information
    data_edges = []
    for edge in G.edges:
        node1 = edge[0]
        node2 = edge[1]
        mut = mr.mutual_info_score(data[node1], data[node2])
        delete_mut.append(mut)
        data_edges.append(edge)
        # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
        #     delete.append((node1, node2))
    seq = list(zip(delete_mut, data_edges))
    seq = sorted(seq, key=lambda s: s[0])
    # if seq[0][0] < alpha:
    #     delete = seq[0:1]
    if len(edges) > 2:
        delete = seq[0:1]
    # Guard against an empty delete list (previously this raised an IndexError)
    if len(add) > 0 and len(delete) > 0:
        if delete[0][0] > add[0][0]:
            delete = []

    print('add')
    for i in add:
        print(str(i[1]) + "," + str(i[0]))
    print('delete')
    for j in delete:
        print(str(j[1]) + "," + str(j[0]))

    # Estimate and print the CPT of each node with a K2 (uniform Dirichlet) prior
    print('cpt')
    estimator = BayesianEstimator(G, data)
    for i in G.nodes:
        cpd = estimator.estimate_cpd(i, prior_type="K2")
        nodeName = i
        values = dict(data[i].value_counts())
        valueNum = len(values)
        CPT = np.transpose(cpd.values)
        # CPT = cpd.values
        sequence = cpd.variables[1::]
        card = []
        for x in sequence:
            s = len(dict(data[x].value_counts()))
            card.append(s)
        output = nodeName + '\t' + str(valueNum) + '\t' + str(CPT.tolist()) \
            + '\t' + str(sequence) + '\t' + str(card)
        print(output)

    # Mutual information of each given edge, then of every ordered node pair
    print('mutual')
    output1 = []
    for i in range(int(len(edges) / 2)):
        mut = mr.mutual_info_score(data[edges[2 * i]], data[edges[2 * i + 1]])
        output1.append(mut)
    output2 = {}
    for node1 in G.nodes():
        d = {}
        for node2 in G.nodes():
            if node1 == node2:
                continue
            mut = mr.mutual_info_score(data[node1], data[node2])
            d[node2] = mut
        output2[node1] = d
    print(output1)
    print(output2)
# **Example 1:** $Z = X + Y$
# %% codecell
import numpy as np
from pandas import DataFrame
from pgmpy.estimators import BDeuScore, K2Score, BicScore
from pgmpy.models import BayesianModel

# Create a random data sample with 3 variables, where Z is dependent on X, Y:
data: DataFrame = DataFrame(data=np.random.randint(low=0, high=4, size=(5000, 2)),
                            columns=list('XY'))
# Making Z dependent (in some arbitrary relation, here addition) on X and Y
data['Z'] = data['X'] + data['Y']

# %% codecell
# Creating the scoring objects from this data:
bdeu: BDeuScore = BDeuScore(data, equivalent_sample_size=5)
k2: K2Score = K2Score(data=data)
bic: BicScore = BicScore(data=data)

# %% codecell
# (drawGraph is assumed to be a plotting helper defined earlier in the notebook)
commonEvidenceModel: BayesianModel = BayesianModel([('X', 'Z'), ('Y', 'Z')])
drawGraph(commonEvidenceModel)

# %% codecell
commonCauseModel: BayesianModel = BayesianModel([('X', 'Z'), ('X', 'Y')])
drawGraph(commonCauseModel)

# %% codecell
bdeu.score(commonEvidenceModel)
# %% codecell
k2.score(commonEvidenceModel)
# %% codecell
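# A plausible continuation (an assumption; the original ends on an empty cell):
# score the common-cause model with the same criteria for comparison. Since
# Z = X + Y makes Z depend on both X and Y, the common-evidence model
# X -> Z <- Y should score higher than the common-cause model here.
bdeu.score(commonCauseModel)
# %% codecell
k2.score(commonCauseModel)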