import pandas as pd


def Hybrid(dataset: pd.DataFrame):
    from pgmpy.estimators import MmhcEstimator
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel
    
    mmhc = MmhcEstimator(dataset)
    # mmpc() takes a significance_level parameter (default=0.01): the desired Type I
    # error probability of falsely rejecting the null hypothesis that two variables
    # are independent, i.e. it caps the Type I error rate.
    # (Therefore, the lower the value, the fewer dependencies we accept, resulting
    # in a sparser graph.)
    skeleton = mmhc.mmpc()
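    # An illustrative alternative: a stricter threshold yields a sparser skeleton,
    # e.g. skeleton = mmhc.mmpc(significance_level=0.001)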
    print("Part 1) Skeleton: ", skeleton.edges())

    # use hill climb search to orient the edges:
    bdeu = BDeuScore(dataset, equivalent_sample_size=5)
    hc = HillClimbSearch(dataset, scoring_method=bdeu)
    # Record the model score after different numbers of iterations
    iter_list = [2**i for i in range(20)]
    eval_list = []
    for iteration in iter_list:
        DAG_connection = hc.estimate(tabu_length=10, white_list=skeleton.to_directed().edges(), max_iter=iteration)
        model = BayesianModel(DAG_connection.edges())
        score = bdeu.score(model)
        print(score)
        eval_list.append(score)

    print("Part 2) Model:    ", model.edges())
    return model.edges(), [iter_list, eval_list]
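
# Usage sketch for Hybrid (a sketch; assumes `df` is a fully discrete pandas DataFrame):
#   edges, (iterations, scores) = Hybrid(df)
#   print(edges)
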
def Hill_Climbing(dataset: pd.DataFrame):
    # from pgmpy.estimators import ExhaustiveSearch
    from pgmpy.estimators import HillClimbSearch
    from pgmpy.estimators import BDeuScore, K2Score, BicScore
    from pgmpy.models import BayesianModel

    bdeu = BDeuScore(dataset, equivalent_sample_size=5)

    hc = HillClimbSearch(dataset, scoring_method=bdeu)
    iter_list = [2**i for i in range(20)]
    eval_list = []
    for iteration in iter_list:
        DAG_connection = hc.estimate(tabu_length=10, max_iter=iteration)
        model = BayesianModel(DAG_connection.edges())
        score = bdeu.score(model)
        print(score)
        eval_list.append(score)
    
    return model.edges(), [iter_list, eval_list]
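
# A standalone usage sketch (assumptions: a fully discrete dataset lives in
# 'data.csv'; matplotlib is available). Plots how the BDeu score evolves with
# the max_iter budget given to hill climbing.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    df = pd.read_csv('data.csv')
    edges, (iterations, scores) = Hill_Climbing(df)
    plt.semilogx(iterations, scores)
    plt.xlabel('max_iter')
    plt.ylabel('BDeu score')
    plt.show()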
Example #3
def _SetScoringType(df, scoretype, verbose=3):
    from pgmpy.estimators import BDeuScore, K2Score, BicScore

    if verbose >= 3: print('[bnlearn] >Set scoring type at [%s]' % (scoretype))

    if scoretype == 'bic':
        scoring_method = BicScore(df)
    elif scoretype == 'k2':
        scoring_method = K2Score(df)
    elif scoretype == 'bdeu':
        scoring_method = BDeuScore(df, equivalent_sample_size=5)
    else:
        raise ValueError('[bnlearn] >Invalid scoretype [%s]; use bic, k2 or bdeu.' % (scoretype))

    return scoring_method
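
# Usage sketch (assumes `df` is a discrete pandas DataFrame and HillClimbSearch
# is imported from pgmpy.estimators):
#   hc = HillClimbSearch(df, scoring_method=_SetScoringType(df, 'k2'))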
Example #4
    def estimate(self,
                 scoring_method=None,
                 tabu_length=10,
                 significance_level=0.01):
        if scoring_method is None:
            scoring_method = BDeuScore(self.data, equivalent_sample_size=10)

        skel = self.mmpc(significance_level)

        hc = HillClimbSearch(self.data, scoring_method=scoring_method)

        model = hc.estimate(white_list=skel.to_directed().edges(),
                            tabu_length=tabu_length)

        return model
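
# The method above mirrors pgmpy's built-in MMHC estimator; an equivalent
# direct call would look like this (a sketch; `data` is a discrete DataFrame):
#   from pgmpy.estimators import MmhcEstimator
#   model = MmhcEstimator(data).estimate(tabu_length=10, significance_level=0.01)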
Example #5
    def __init__(self, fileName, alpha=1):
        """ Build the empty graph model with n=data.shape[1] vertices. Set other attributes for use in model learning.
        Args:
            :param fileName (string): The path of the .csv file containing the data.
            :param alpha (int): the equivalent sample size of the Dirichlet uniform prior.
        """

        self.alpha = alpha
        self.data = np.genfromtxt(fileName, delimiter=',')
        self.num_vars = self.data.shape[1]
        self.data_frame = pd.DataFrame(self.data, columns=list(range(self.num_vars)))
        self.var_states = []

        for index in range(len(self.data_frame.columns)):
            self.var_states.append(self.data_frame[index].nunique())

        self.bdeu = BDeuScore(self.data_frame, equivalent_sample_size=alpha)

        self.undirected = nx.Graph()
        self.undirected.add_nodes_from(range(self.num_vars))

        self.maxtree = self.undirected.copy()

        self.directed = BayesianModel()
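
# Instantiation sketch (the enclosing class is not shown above, so `GraphLearner`
# is a placeholder name): load a comma-separated data file and inspect the
# per-variable state counts.
#   learner = GraphLearner('data.csv', alpha=1)
#   print(learner.num_vars, learner.var_states)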
Example #6
import functools
import itertools
import time
import typing
from concurrent.futures import ProcessPoolExecutor

import psutil
from pgmpy.estimators import BDeuScore
from pgmpy.models import BayesianModel


def forward_stepwise_selection(
    current_nodes: typing.Sequence[str],
    next_nodes: typing.Sequence[str],
    indegree: int,
    estimator: BDeuScore,
    verbose: bool = False,
) -> typing.Tuple[BayesianModel, float]:
    assert indegree <= len(
        current_nodes
    ), "Indegree should not be greater than the number of nodes in a time slice"

    start_time = time.time()

    # Initialize model with continuity constraints
    continuity_constraints = list(zip(current_nodes, next_nodes))
    model = BayesianModel(continuity_constraints)

    # Create support dictionary to find node ICX_t from ICX_t+1
    node_map = {n: c for c, n in continuity_constraints}

    # Get number of available cores
    n_procs = psutil.cpu_count(logical=False)
    if verbose:
        print(f"Using {n_procs} cores")
        print("-" * 50)

    # If indegree is equal to the total number of nodes in the current slice,
    # speed up the search by simply constructing a dense network
    if indegree == len(current_nodes):
        if verbose:
            print("Building dense network...")
        edges = list(itertools.product(current_nodes, next_nodes))
        model.add_edges_from(edges)
    else:
        # Scan next nodes
        for x_n in next_nodes:
            if verbose:
                print(f"Finding parents of node {x_n}...")

            # Keep track of already added nodes to avoid repetitions
            # Take continuity constraints into account in the added nodes
            added_nodes = set()
            added_nodes.add(node_map[x_n])
            # Loop is repeated (indegree - 1) times since one parent node
            # has already been added due to continuity constraints
            for _ in range(indegree - 1):
                # Consider as testable only non-looping nodes
                testable_nodes = sorted(
                    set(current_nodes).difference(
                        added_nodes))  # sort for reproducibility

                # Use multiprocessing to speed-up search
                with ProcessPoolExecutor(max_workers=n_procs) as executor:
                    # Fix estimator, model and x_n parameters and feed the partial function to the executor
                    edge_eval_par = functools.partial(edge_eval,
                                                      estimator=estimator,
                                                      model=model,
                                                      x_n=x_n)
                    scores = dict(
                        zip(testable_nodes,
                            executor.map(edge_eval_par, testable_nodes)))
                node_to_add = max(scores, key=scores.get)
                best_score = scores[node_to_add]
                model.add_edge(node_to_add, x_n)
                added_nodes.add(node_to_add)
                if verbose:
                    print(f"\t Added {node_to_add} with {best_score:.2f}")
    final_score = estimator.score(model)
    if verbose:
        print("-" * 50)
        print(f"Running time: {time.time() - start_time:.2f} s")
    return model, final_score
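
# Usage sketch (assumptions: `df` is a DataFrame with one column per node for
# two consecutive time slices, where cur[i] is the earlier copy of nxt[i]):
#   cur = ['X0_t', 'X1_t', 'X2_t']
#   nxt = ['X0_t+1', 'X1_t+1', 'X2_t+1']
#   estimator = BDeuScore(df, equivalent_sample_size=5)
#   model, score = forward_stepwise_selection(cur, nxt, indegree=2,
#                                             estimator=estimator, verbose=True)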
Example #7
from pgmpy.estimators import BDeuScore
from pgmpy.models import BayesianModel


def edge_eval(x_c: str, x_n: str, model: BayesianModel,
              estimator: BDeuScore) -> float:
    # Define function for evaluating each edge connecting to X_n
    copy = model.copy()
    copy.add_edge(x_c, x_n)
    return estimator.score(copy)
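
# Direct-call sketch (illustrative names): score the model as if the single
# candidate edge X0_t -> X1_t+1 had been added, without mutating `model`:
#   new_score = edge_eval('X0_t', 'X1_t+1', model, estimator)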
Example #8
    def opt(self, file1, file2):
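        """Score the graph read from file1 against the data in file2, propose
        one edge to add and one to delete based on mutual information, then
        print K2-prior CPTs and pairwise mutual-information tables."""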
        f1 = open(file1, encoding="utf8")
        lines = f1.readlines()
        nodes = self.getegdes(lines[0])
        edges = self.getegdes(lines[1])
        data = pd.read_csv(file2)

        G = BayesianModel()
        G.add_nodes_from(nodes)
        for i in range(int(len(edges) / 2)):
            G.add_edge(edges[2 * i], edges[2 * i + 1])
        # nx.draw(G)
        # plt.show()
        k2 = K2Score(data).score(G)
        bic = BicScore(data).score(G)
        bdeu = BDeuScore(data).score(G)
        print(k2, ",", bic, ",", bdeu)

        est = HillClimbSearch(data, scoring_method=K2Score(data))
        model = est.estimate()
        model_edges = model.edges()
        G_ = nx.DiGraph()
        G_.add_edges_from(model_edges)
        G_copy = nx.DiGraph()
        G_copy.add_edges_from(G.edges)
        add = []
        add_mut = []
        delete = []
        delete_mut = []
        for edge in model_edges:
            node1 = edge[0]
            node2 = edge[1]
            if not nx.has_path(G, node2, node1):
                if not G.has_edge(node1, node2):
                    this = (node1, node2)
                    # this = '('+node1+','+node2+')'
                    add.append(this)
                    mut = mr.mutual_info_score(data[node1], data[node2])
                    add_mut.append(mut)
        seq = list(zip(add_mut, add))
        seq = sorted(seq, key=lambda s: s[0], reverse=True)
        alpha = 0.015
        # if seq[0][0] > alpha:
        #     add = seq[0:1]

        add = seq[0:1]

        data_edges = []
        for edge in G.edges:
            node1 = edge[0]
            node2 = edge[1]
            mut = mr.mutual_info_score(data[node1], data[node2])
            delete_mut.append(mut)
            data_edges.append(edge)
            # if not (nx.has_path(G_, node1, node2) or nx.has_path(G_, node2, node1)):
            #     this = '('+node1+','+node2+')'
            #     delete.append(this)
        seq = list(zip(delete_mut, data_edges))
        seq = sorted(seq, key=lambda s: s[0])

        # if seq[0][0] < alpha:
        #     delete = seq[0:1]
        if len(edges) > 2:
            delete = seq[0:1]
            if len(add) > 0:
                if delete[0][0] > add[0][0]:
                    delete = []

        print('add')
        for i in add:
            print(str(i[1]) + "," + str(i[0]))

        print('delete')
        for j in delete:
            print(str(j[1]) + "," + str(j[0]))
            # print(j[0])

        print('cpt')
        estimator = BayesianEstimator(G, data)
        for i in G.nodes:
            cpd = estimator.estimate_cpd(i, prior_type="K2")
            nodeName = i
            values = dict(data[i].value_counts())
            valueNum = len(values)
            CPT = np.transpose(cpd.values)
            # CPT = cpd.values
            sequence = cpd.variables[1::]
            card = []
            for x in sequence:
                s = len(dict(data[x].value_counts()))
                card.append(s)
            output = nodeName + '\t' + str(valueNum) + '\t' + str(
                CPT.tolist()) + '\t' + str(sequence) + '\t' + str(card)
            print(output)

        print('mutual')
        output1 = []
        for i in range(int(len(edges) / 2)):
            mut = mr.mutual_info_score(data[edges[2 * i]],
                                       data[edges[2 * i + 1]])
            output1.append(mut)
        output2 = {}
        for node1 in G.nodes():
            d = {}
            for node2 in G.nodes():
                if node1 == node2:
                    continue
                mut = mr.mutual_info_score(data[node1], data[node2])

                d[node2] = mut
            output2[node1] = d
        print(output1)
        print(output2)
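
# Input-format sketch (an assumption inferred from how lines[0] and lines[1] are
# consumed above): file1 holds the node names on its first line and a flat
# source,target,source,target,... edge list on its second line; file2 is the CSV
# dataset. A hypothetical call, with `Optimizer` as a placeholder class name:
#   Optimizer().opt('structure.txt', 'data.csv')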
# **Example 1:** $Z = X + Y$
# %% codecell
from pgmpy.estimators import BDeuScore, K2Score, BicScore
from pgmpy.models import BayesianModel
from pandas import DataFrame
import numpy as np

# Create a random data sample with two variables X, Y; a third variable Z
# that depends on them is added below:
data: DataFrame = DataFrame(data=np.random.randint(low=0,
                                                   high=4,
                                                   size=(5000, 2)),
                            columns=list('XY'))

# Make Z dependent on X and Y (via an arbitrary relation, here addition)
data['Z'] = data['X'] + data['Y']

# %% codecell
# Creating the scoring objects from this data:
bdeu: BDeuScore = BDeuScore(data, equivalent_sample_size=5)
k2: K2Score = K2Score(data=data)
bic: BicScore = BicScore(data=data)

# %% codecell
commonEvidenceModel: BayesianModel = BayesianModel([('X', 'Z'), ('Y', 'Z')])
drawGraph(commonEvidenceModel)
# %% codecell
commonCauseModel: BayesianModel = BayesianModel([('X', 'Z'), ('X', 'Y')])
drawGraph(commonCauseModel)

# %% codecell
bdeu.score(commonEvidenceModel)
# %% codecell
k2.score(commonEvidenceModel)
# %% codecell
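# A plausible continuation (sketch): complete the cell with the BIC score, then
# score the common-cause structure with all three metrics so the two candidate
# DAGs can be compared directly.
bic.score(commonEvidenceModel)

# %% codecell
bdeu.score(commonCauseModel)
k2.score(commonCauseModel)
bic.score(commonCauseModel)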