Example #1
def bayesnet_examples():
    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import numpy as np
    import pandas as pd

    student_model = BayesianModel([('D', 'G'),
                                   ('I', 'G'),
                                   ('G', 'L'),
                                   ('I', 'S')])
    # we can generate some random data.
    raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S'])
    data_train = data[: int(data.shape[0] * 0.75)]
    student_model.fit(data_train)
    student_model.get_cpds()

    data_test = data[int(0.75 * data.shape[0]):].copy()  # copy to avoid pandas SettingWithCopyWarning on the in-place drop
    data_test.drop('D', axis=1, inplace=True)
    student_model.predict(data_test)

    grade_cpd = TabularCPD(
        variable='G',
        variable_card=3,
        values=[[0.3, 0.05, 0.9, 0.5],
                [0.4, 0.25, 0.08, 0.3],
                [0.3, 0.7, 0.02, 0.2]],
        evidence=['I', 'D'],
        evidence_card=[2, 2])
    difficulty_cpd = TabularCPD(
        variable='D',
        variable_card=2,
        values=[[0.6, 0.4]])
    intel_cpd = TabularCPD(
        variable='I',
        variable_card=2,
        values=[[0.7, 0.3]])
    letter_cpd = TabularCPD(
        variable='L',
        variable_card=2,
        values=[[0.1, 0.4, 0.99],
                [0.9, 0.6, 0.01]],
        evidence=['G'],
        evidence_card=[3])
    sat_cpd = TabularCPD(
        variable='S',
        variable_card=2,
        values=[[0.95, 0.2],
                [0.05, 0.8]],
        evidence=['I'],
        evidence_card=[2])
    student_model.add_cpds(grade_cpd, difficulty_cpd,
                           intel_cpd, letter_cpd,
                           sat_cpd)
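    # Hedged usage sketch (not part of the original example): once the
    # hand-specified CPDs are attached (recent pgmpy versions replace the CPDs
    # learned by fit() above), the model can be validated and queried.
    # The evidence values below are illustrative.
    from pgmpy.inference import VariableElimination
    student_model.check_model()
    infer = VariableElimination(student_model)
    print(infer.query(variables=['G'], evidence={'D': 0, 'I': 1}))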
Example #3
def parents_instantiated(self, cid: BayesianModel) -> bool:
    """Checks that all parents have been instantiated, which is a pre-condition for instantiating self"""
    for p in self.evidence:
        p_cpd = cid.get_cpds(p)
        if not (p_cpd and hasattr(p_cpd, 'state_names')):
            return False
    return True
Example #4
def to_dynamic_cpd(
    static_model: BayesianModel,
    stat_to_dyn_map: typing.Dict[str, str],
    next_to_curr_map: typing.Dict[str, str],
) -> typing.List[TabularCPD]:
    # Lambda to obtain dynamic nodes' name
    get_dynamic_node = (lambda node: (stat_to_dyn_map[node], 0)
                        if node.endswith("_T") else
                        (stat_to_dyn_map[next_to_curr_map[node]], 1))

    # Extract information about CPDs of the static model
    cpds_info = [{
        "variable": get_dynamic_node(cpd.variable),
        "variable_card": 4,
        "values": cpd.get_values(),
        "evidence": ([(stat_to_dyn_map[e], 0) for e in cpd.get_evidence()][::-1]
                     if len(cpd.get_evidence()) > 0 else None),
        "evidence_card": ([4] * len(cpd.get_evidence())
                          if len(cpd.get_evidence()) > 0 else None),
        "state_names": {get_dynamic_node(k): v
                        for k, v in cpd.state_names.items()},
    } for cpd in static_model.get_cpds()]

    return [TabularCPD(**cpd_info) for cpd_info in cpds_info]
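# Hedged usage sketch for to_dynamic_cpd (the maps and node names below are
# hypothetical; static nodes are assumed to end in "_T" for the current slice
# and "_T+1" for the next slice):
#     stat_to_dyn_map = {'X_T': 'X', 'Y_T': 'Y'}
#     next_to_curr_map = {'X_T+1': 'X_T', 'Y_T+1': 'Y_T'}
#     dynamic_cpds = to_dynamic_cpd(static_model, stat_to_dyn_map, next_to_curr_map)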
Example #5
def pgmpyToGrid(model: BayesianModel,
                queryNode: Name,
                shorten: bool = True) -> Grid:
    '''
    Renders a list of lists (grid) from the pgmpy model, out of the CPD for the given query node.
    '''
    # Get the dictionary of 'var' : [states]
    allVarStates: Dict[Name,
                       List[State]] = model.get_cpds(queryNode).state_names

    condVarStates: Dict[Name,
                        List[State]] = dict(list(allVarStates.items())[1:])

    # Doing product between states of the evidence (conditional) variables to get: (Dumb, Easy), (Dumb, Hard),
    # (Intelligent, Easy), (Intelligent, Hard) ...
    condStateProducts: List[Tuple[State, State]] = list(
        itertools.product(*list(condVarStates.values())))

    # Transposing the CPDs to get the rows in column format, since this is what the renderTable function expects to use.
    cpdProbabilities: List[np.ndarray] = list(
        model.get_cpds(queryNode).get_values().T)

    # This is basically the grid, with titles next to probabilities, but we need to
    # format it so everything is a list and no other structure is nested inside:
    tempGrid: Grid = list(zip(condStateProducts, cpdProbabilities))

    grid: Grid = [
        list(nameProduct) + list(probs) for nameProduct, probs in tempGrid
    ]

    if shorten and len(grid) > 15:  # only insert the '...' row when there are more than 15 rows
        BOTTOM_ROWS: int = 5
        TOP_ROWS: int = 10

        # Shorten the grid by replacing the middle rows with a placeholder row
        blankRow = ['...' for _ in range(len(grid[0]))]
        grid = grid[0:TOP_ROWS] + [blankRow] + grid[len(grid) - BOTTOM_ROWS:]

    return grid
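# Hedged usage sketch: Grid, Name, and State are this project's own type aliases,
# so the call below is illustrative only, on a model whose CPDs are already fitted:
#     grid = pgmpyToGrid(model=student_model, queryNode='G', shorten=True)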
Example #6
def parent_values(self, cid: BayesianModel) -> List[List]:
    """Return a list of lists for the values each parent can take (based on the parent state names)"""
    assert self.parents_instantiated(cid)
    parent_values = []
    for p in self.evidence:
        p_cpd = cid.get_cpds(p)
        if p_cpd and hasattr(p_cpd, 'state_names'):
            parent_values.append(p_cpd.state_names[p])
    return parent_values
Example #7
def conditionalDistDf(model: BayesianModel,
                      query: RandomVariable) -> DataFrame:
    '''
    Given a query variable, gets its conditional TabularCPD and puts that into a pandas DataFrame
    '''
    # Get the Tabular CPD (learned) from the model:
    queryTCPD: TabularCPD = model.get_cpds(query.var)

    return tabularDf(cpd=queryTCPD)
Example #8
class BayesNetwork:
    def __init__(self, dataset, graph_structure_index):
        self.dataset = dataset
        self.columns = dataset.dataframe.columns
        self.graph_structure_index = graph_structure_index

    def build_graph(self):
        graph_structure_name = list(
            map(lambda tuple: (self.columns[tuple[0]], self.columns[tuple[1]]),
                self.graph_structure_index))
        self.model = BayesianModel(graph_structure_name)

    def draw_graph(self):
        Drawer.draw_graph(self.model)

    def fit_model(self, prior=False, prior_data=None):
        if prior:
            pseudo_counts = {
                'D': [300, 700],
                'I': [500, 500],
                'G': [800, 200],
                'L': [500, 500],
                'S': [400, 600]
            }
            raise NotImplementedError
        else:
            self.model.fit(self.dataset.dataframe[0:-3],
                           estimator=MaximumLikelihoodEstimator)

    def inference(self, name):
        from pgmpy.inference import VariableElimination
        self.infer = VariableElimination(self.model)
        q = self.infer.query(variables=[name])
        print(q[name])

    def evaluate_result(self):
        for cpd in self.model.get_cpds():
            print("CPD of {variable}:".format(variable=cpd.variable))
            print(cpd)
            accept_node = cpd.variables[0]

            ##3D-dimension
            if len(cpd.values.shape) > 3:
                pass
                # Drawer.draw_3D(cpd.values, x_label=cpd.variables[1],
                #                y_label=cpd.variables[2], z_label=cpd.variables[3])
            ##2D Dimension
            elif len(cpd.values.shape) == 2:
                title = cpd.variables[1] + '----->' + accept_node
                Drawer(title=title,
                       is_show=False,
                       is_save=False,
                       save_path='img/' + title + '.jpg').draw_matrix(
                           cpd.values)
def pgmpy_test():

    raw_data = np.array([0] * 30 +
                        [1] * 70)  # Representing heads by 0 and tails by 1
    data = pd.DataFrame(raw_data, columns=['coin'])
    print(data)
    model = BayesianModel()
    model.add_node('coin')

    # Fitting the data to the model using Maximum Likelihood Estimator
    model.fit(data, estimator=MaximumLikelihoodEstimator)
    print(model.get_cpds('coin'))
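    # With 30 zeros and 70 ones, the maximum-likelihood CPD printed above should
    # come out to approximately P(coin=0) = 0.3 and P(coin=1) = 0.7.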
def create_network(models, processes, files):
    for p in range(files):
        temp_model = BayesianModel()
        for e in range(len(processes[p].get_errors())):
            temp_error = processes[p].get_error(e)
            for c in range(len(temp_error.get_causes())):
                temp_cause = temp_error.get_cause(c)
                q = temp_cause.get_occ_prob(
                ) / temp_error.get_total_cause_prob()
                temp_cause.set_occ_prob(q)
                temp_model.add_nodes_from([temp_cause, temp_error])
                temp_model.add_edge(temp_cause, temp_error)
                temp_cause_cpd = TabularCPD(variable=temp_cause,
                                            variable_card=2,
                                            values=[[q, 1 - q]])
                temp_model.add_cpds(temp_cause_cpd)
            temp_error_cpd = TabularCPD(
                variable=temp_error,
                variable_card=2,
                values=get_initial_error_cpd(len(temp_error.get_causes())),
                evidence=temp_error.get_causes(),
                evidence_card=[2] * (len(temp_error.get_causes())))
            temp_model.add_cpds(temp_error_cpd)
            for f in range(len(temp_error.get_effects())):
                temp_effect = temp_error.get_effect(f)
                temp_model.add_nodes_from([temp_error, temp_effect])
                temp_model.add_edge(temp_error, temp_effect)
        models.append(temp_model)
        #plotting Failure Tree
        dot = to_pydot(models[p])
        with open('failure_tree_graph_%s.png' % processes[p], 'wb') as f:
            f.write(dot.create_png())
        #Sample output of CPDs for causes and errors
        for e in range(len(processes[p].get_errors())):
            for c in range(len(processes[p].get_error(e).get_causes())):
                print(
                    temp_model.get_cpds(
                        processes[p].get_error(e).get_cause(c)))
            print(temp_model.get_cpds(processes[p].get_error(e)))
class TestBayesianModelFitPredict(unittest.TestCase):
    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])

        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            value = (values.loc[:, node].value_counts() /
                     values.loc[:, node].value_counts().sum())
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_connected_predict(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(e_predict.values.ravel(),
                                   np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
                                             1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
                                             0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
                                             0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
                                             0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
                                             1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
                                             1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
                                             1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
                                             0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
                                             1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                                             1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
                                             0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
                                             1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
                                             1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
                                             1, 1, 1, 0]))

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def pgmpy_test2():
    # example from https://github.com/pgmpy/pgmpy/blob/dev/examples/Learning%20from%20data.ipynb
    # Generating random data, with each variable having 2 states and equal probabilities for each state

    raw_data = np.random.randint(low=0, high=2, size=(1000, 5))
    data = pd.DataFrame(raw_data, columns=['D', 'I', 'G', 'L', 'S'])

    model = BayesianModel([('D', 'G'), ('I', 'G'), ('I', 'S'), ('G', 'L')])

    # Learning CPDs using Maximum Likelihood Estimators
    model.fit(data, estimator=MaximumLikelihoodEstimator)
    for cpd in model.get_cpds():
        print("CPD of {variable}:".format(variable=cpd.variable))
        print(cpd)
Example #13
def jointDistribution(model: BayesianModel) -> JointProbabilityDistribution:
    ''' Returns joint prob distribution over entire network'''

    # The CPDs do not strictly need to be converted to DiscreteFactors (variables,
    # values, and cardinality are accessible the same way), but this is how the
    # mini-example in the API docs does it (the imap() implementation).
    from functools import reduce  # local imports so the snippet is self-contained
    from operator import mul
    factors: List[DiscreteFactor] = [
        cpd.to_factor() for cpd in model.get_cpds()
    ]
    jointProbFactor: DiscreteFactor = reduce(mul, factors)

    # TODO need to assert that probabilities sum to 1? Always true? or to normalize here?

    return JointProbabilityDistribution(
        variables=jointProbFactor.variables,
        cardinality=jointProbFactor.cardinality,
        values=jointProbFactor.values)
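# Note on the TODO above: if model.check_model() passes, every CPD is normalized,
# so their product is the joint distribution by the chain rule and already sums
# to 1. A hedged sanity check (assumes numpy imported as np):
#     jpd = jointDistribution(model)
#     assert np.isclose(jpd.values.sum(), 1.0)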
Example #14
def make_bayes_net(load=False, subtree=True, modelsdir=MODEL_CPDS_DIR):
    print('Making bayes net')
    graph_file = RUNNING_MODEL_DIR + '/' + 'graph.p'
    if os.path.isfile(graph_file) and load:
        print('Loading saved graph from file...')
        G = pickle.load(open(graph_file, 'rb'))
        G.check_model()
    else:
        print('loading data...')
        training_labels, go_dict = load_label_data()
        if subtree:
            labels_list = _subtree_labels()
            print(labels_list)
        else:
            labels_list = go_dict.keys()

        print('adding nodes and edges...')
        G = BayesianModel()
        G.add_edges_from([(label, label + '_hat') for label in labels_list])
        obo_graph = obonet.read_obo(OBODB_FILE)
        for label in labels_list:
            children = [
                c for c in networkx.ancestors(obo_graph, label)
                if c in labels_list
            ]
            for child in children:
                G.add_edge(child, label)

        predicted_cpds = get_model_cpds(labels_list=labels_list,
                                        modelsdir=MODEL_CPDS_DIR)
        for cpd in predicted_cpds:
            G.add_cpds(cpd)
        true_label_cpds = get_true_label_cpds(training_labels,
                                              go_dict,
                                              labels_list=labels_list)
        for cpd in true_label_cpds:
            G.add_cpds(cpd)
        remove_list = []
        for node in G.nodes():
            if G.get_cpds(node) is None:
                remove_list.append(node)
                # remove_list.append(node+'_hat')
        for node in remove_list:
            if node in G:
                G.remove_node(node)
        G.check_model()
        pickle.dump(G, open(graph_file, 'wb'))
    return G
Example #15
def initialize_tabular_cpd(self, cid: BayesianModel) -> bool:
    """initialize the TabularCPD with a matrix representing a uniform random distribution"""
    parents = cid.get_parents(self.variable)
    # check that parents are initialized
    for parent in parents:
        if not cid.get_cpds(parent):
            return False
    parents_card = [cid.get_cardinality(p) for p in parents]
    # np.prod replaces the deprecated np.product alias; behavior is identical
    transition_matrix = np.ones(
        (self.variable_card,
         np.prod(parents_card).astype(int))) / self.variable_card
    super().__init__(self.variable,
                     self.variable_card,
                     transition_matrix,
                     parents,
                     parents_card,
                     state_names=self.state_names)
    return True
Example #16
def probnet():
    # Defining the model structure. We can define the network by just passing a list of edges.
    model = BayesianModel([('H', 'S'), ('B', 'S'), ('D', 'S')])
    # Defining individual CPDs.
    cpd_h = TabularCPD(variable='H', variable_card=2, values=[[0.2, 0.8]])
    cpd_b = TabularCPD(variable='B', variable_card=2, values=[[0.1, 0.9]])
    cpd_d = TabularCPD(variable='D', variable_card=2, values=[[0.5, 0.5]])
    cpd_s = TabularCPD(variable='S',
                       variable_card=2,
                       values=[[0.1, 0.2, 0.1, 0.15, 0.4, 0.35, 0.45, 0.43],
                               [0.9, 0.8, 0.9, 0.85, 0.6, 0.65, 0.55, 0.57]],
                       evidence=['H', 'B', 'D'],
                       evidence_card=[2, 2, 2])
    # Associating the CPDs with the network
    model.add_cpds(cpd_h, cpd_b, cpd_d, cpd_s)
    # check_model checks for the network structure and CPDs and verifies that the CPDs are correctly
    # defined and sum to 1.
    model.check_model()
    print(model.get_cpds('S'))
    # infer = VariableElimination(model)
    # infer.map_query('S', evidence={'H': 1, 'B': 0, 'D': 1})
    return model
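# Hedged sketch of the inference that is commented out inside probnet() above:
#     from pgmpy.inference import VariableElimination
#     infer = VariableElimination(probnet())
#     print(infer.map_query(['S'], evidence={'H': 1, 'B': 0, 'D': 1}))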
def Bayesian_estimate(data, dependency_structure, graph_edges):
    data.columns = [i + 1 for i in range(data.shape[1])]
    print(data)
    model = BayesianModel(graph_edges)
    model.fit(data, estimator=BayesianEstimator,
              prior_type="BDeu")  # default equivalent_sample_size=5

    for column in data.columns:
        print(column, data[column].unique())
    probs = {}

    for parent, child in dependency_structure:
        cpd = model.get_cpds(node=child)
        print()
        print(cpd)
        print(cpd.variable_card)
        index = [
            cpd.variables.index(var) -
            1 if var > 0 else cpd.variables.index(-1 * var) - 1
            for var in parent
        ]
        ordered_parent = [x for _, x in sorted(zip(index, parent))]
        print(cpd.values)
        if (cpd.variable_card == 1):
            if (data[child].unique()[0] == 0):
                value = 1 - cpd.values[0]
            else:
                value = cpd.values[0]
        else:
            assert cpd.variable_card == 2
            value = cpd.values[1]
        print(value)
        for var in ordered_parent:
            value = value[0] if var < 0 else value[1]
        probs[(parent, child)] = value
        print((parent, child), probs[(parent, child)])

    return probs
class TestBayesianModelMethods(unittest.TestCase):

    def setUp(self):
        self.G = BayesianModel([('a', 'd'), ('b', 'd'),
                                ('d', 'e'), ('b', 'c')])
        self.G1 = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
        diff_cpd = TabularCPD('diff', 2, values=[[0.2], [0.8]])
        intel_cpd = TabularCPD('intel', 3, values=[[0.5], [0.3], [0.2]])
        grade_cpd = TabularCPD('grade', 3, values=[[0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                                                   [0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
                                                   [0.8, 0.8, 0.8, 0.8, 0.8, 0.8]],
                               evidence=['diff', 'intel'], evidence_card=[2, 3])
        self.G1.add_cpds(diff_cpd, intel_cpd, grade_cpd)
        self.G2 = BayesianModel([('d', 'g'), ('g', 'l'), ('i', 'g'), ('i', 'l')])

    def test_moral_graph(self):
        moral_graph = self.G.moralize()
        self.assertListEqual(sorted(moral_graph.nodes()), ['a', 'b', 'c', 'd', 'e'])
        for edge in moral_graph.edges():
            self.assertTrue(edge in [('a', 'b'), ('a', 'd'), ('b', 'c'), ('d', 'b'), ('e', 'd')] or
                            (edge[1], edge[0]) in [('a', 'b'), ('a', 'd'), ('b', 'c'), ('d', 'b'), ('e', 'd')])

    def test_moral_graph_with_edge_present_over_parents(self):
        G = BayesianModel([('a', 'd'), ('d', 'e'), ('b', 'd'), ('b', 'c'), ('a', 'b')])
        moral_graph = G.moralize()
        self.assertListEqual(sorted(moral_graph.nodes()), ['a', 'b', 'c', 'd', 'e'])
        for edge in moral_graph.edges():
            self.assertTrue(edge in [('a', 'b'), ('c', 'b'), ('d', 'a'), ('d', 'b'), ('d', 'e')] or
                            (edge[1], edge[0]) in [('a', 'b'), ('c', 'b'), ('d', 'a'), ('d', 'b'), ('d', 'e')])

    def test_get_ancestors_of_success(self):
        ancestors1 = self.G2._get_ancestors_of('g')
        ancestors2 = self.G2._get_ancestors_of('d')
        ancestors3 = self.G2._get_ancestors_of(['i', 'l'])
        self.assertEqual(ancestors1, {'d', 'i', 'g'})
        self.assertEqual(ancestors2, {'d'})
        self.assertEqual(ancestors3, {'g', 'i', 'l', 'd'})

    def test_get_ancestors_of_failure(self):
        self.assertRaises(ValueError, self.G2._get_ancestors_of, 'h')

    def test_local_independencies(self):
        self.assertEqual(self.G.local_independencies('a'), Independencies(['a', ['b', 'c']]))
        self.assertEqual(self.G.local_independencies('c'), Independencies(['c', ['a', 'd', 'e'], 'b']))
        self.assertEqual(self.G.local_independencies('d'), Independencies(['d', 'c', ['b', 'a']]))
        self.assertEqual(self.G.local_independencies('e'), Independencies(['e', ['c', 'b', 'a'], 'd']))
        self.assertEqual(self.G.local_independencies('b'), Independencies(['b', 'a']))
        self.assertEqual(self.G1.local_independencies('grade'), Independencies())

    def test_get_independencies(self):
        chain = BayesianModel([('X', 'Y'), ('Y', 'Z')])
        self.assertEqual(chain.get_independencies(), Independencies(('X', 'Z', 'Y'), ('Z', 'X', 'Y')))
        fork = BayesianModel([('Y', 'X'), ('Y', 'Z')])
        self.assertEqual(fork.get_independencies(), Independencies(('X', 'Z', 'Y'), ('Z', 'X', 'Y')))
        collider = BayesianModel([('X', 'Y'), ('Z', 'Y')])
        self.assertEqual(collider.get_independencies(), Independencies(('X', 'Z'), ('Z', 'X')))

    def test_is_imap(self):
        val = [0.01, 0.01, 0.08, 0.006, 0.006, 0.048, 0.004, 0.004, 0.032,
               0.04, 0.04, 0.32, 0.024, 0.024, 0.192, 0.016, 0.016, 0.128]
        JPD = JointProbabilityDistribution(['diff', 'intel', 'grade'], [2, 3, 3], val)
        fac = DiscreteFactor(['diff', 'intel', 'grade'], [2, 3, 3], val)
        self.assertTrue(self.G1.is_imap(JPD))
        self.assertRaises(TypeError, self.G1.is_imap, fac)

    def test_get_immoralities(self):
        G = BayesianModel([('x', 'y'), ('z', 'y'), ('x', 'z'), ('w', 'y')])
        self.assertEqual(G.get_immoralities(), {('w', 'x'), ('w', 'z')})
        G1 = BayesianModel([('x', 'y'), ('z', 'y'), ('z', 'x'), ('w', 'y')])
        self.assertEqual(G1.get_immoralities(), {('w', 'x'), ('w', 'z')})
        G2 = BayesianModel([('x', 'y'), ('z', 'y'), ('x', 'z'), ('w', 'y'), ('w', 'x')])
        self.assertEqual(G2.get_immoralities(), {('w', 'z')})

    def test_is_iequivalent(self):
        G = BayesianModel([('x', 'y'), ('z', 'y'), ('x', 'z'), ('w', 'y')])
        self.assertRaises(TypeError, G.is_iequivalent, MarkovModel())
        G1 = BayesianModel([('V', 'W'), ('W', 'X'), ('X', 'Y'), ('Z', 'Y')])
        G2 = BayesianModel([('W', 'V'), ('X', 'W'), ('X', 'Y'), ('Z', 'Y')])
        self.assertTrue(G1.is_iequivalent(G2))
        G3 = BayesianModel([('W', 'V'), ('W', 'X'), ('Y', 'X'), ('Z', 'Y')])
        self.assertFalse(G3.is_iequivalent(G2))

    def test_copy(self):
        model_copy = self.G1.copy()
        self.assertEqual(sorted(self.G1.nodes()), sorted(model_copy.nodes()))
        self.assertEqual(sorted(self.G1.edges()), sorted(model_copy.edges()))
        self.assertNotEqual(id(self.G1.get_cpds('diff')),
                            id(model_copy.get_cpds('diff')))

        self.G1.remove_cpds('diff')
        diff_cpd = TabularCPD('diff', 2, values=[[0.3], [0.7]])
        self.G1.add_cpds(diff_cpd)
        self.assertNotEqual(self.G1.get_cpds('diff'),
                            model_copy.get_cpds('diff'))

        self.G1.remove_node('intel')
        self.assertNotEqual(sorted(self.G1.nodes()), sorted(model_copy.nodes()))
        self.assertNotEqual(sorted(self.G1.edges()), sorted(model_copy.edges()))

    def test_remove_node(self):
        self.G1.remove_node('diff')
        self.assertEqual(sorted(self.G1.nodes()), sorted(['grade', 'intel']))
        self.assertRaises(ValueError, self.G1.get_cpds, 'diff')

    def test_remove_nodes_from(self):
        self.G1.remove_nodes_from(['diff', 'grade'])
        self.assertEqual(sorted(self.G1.nodes()), sorted(['intel']))
        self.assertRaises(ValueError, self.G1.get_cpds, 'diff')
        self.assertRaises(ValueError, self.G1.get_cpds, 'grade')

    def tearDown(self):
        del self.G
        del self.G1
        ax_temp.bar(x, z, zs=y, zdir='y', alpha=0.6, color='r' * 4)
        ax_temp.set_xlabel('X')
        ax_temp.set_ylabel('Y')
        ax_temp.set_zlabel('Z')
        ax_temp.title.set_text(('Feature ' + str(mean_indices[counter])))
        counter += 1
plt.show()

# Learning naive bayes model from various subsets of data
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 4])
naive_bayes_with_some_features(all_city_data, all_city_label, feature_list=[0, 1, 2, 3, 4, 5])

# Splitting train and test data for PGM model
temp_data = pd.concat([all_city_data, pd.DataFrame(all_city_label, columns=[13])], axis=1)
pgm_train_set = temp_data.loc[0:700]
pgm_test_set = temp_data.loc[700:]
print(pgm_train_set)


# Implementing PGM model on data
# Using these features: 0: (age) 1: (sex) 2: (cp)
pgm_model = BayesianModel()
pgm_model.add_nodes_from([0, 1, 2, 13])
pgm_model.add_edges_from([(1, 13)])
pgm_model.fit(pgm_train_set.loc[:, [0, 1, 2, 13]])
pgm_test_set = pgm_test_set.loc[:, [0, 1, 2, 13]].drop(13, axis=1)
print(pgm_test_set)
print(pgm_model.get_cpds(13))
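# Hedged follow-up (not in the original snippet): the fitted model can impute the
# dropped label column 13 for the test rows.
print(pgm_model.predict(pgm_test_set))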
Example #20
def bayesnet():
    """
    References:
        https://class.coursera.org/pgm-003/lecture/17
        http://www.cs.ubc.ca/~murphyk/Bayes/bnintro.html
        http://www3.cs.stonybrook.edu/~sael/teaching/cse537/Slides/chapter14d_BP.pdf
        http://www.cse.unsw.edu.au/~cs9417ml/Bayes/Pages/PearlPropagation.html
        https://github.com/pgmpy/pgmpy.git
        http://pgmpy.readthedocs.org/en/latest/
        http://nipy.bic.berkeley.edu:5000/download/11
    """
    # import operator as op
    # # Enumerate all possible events
    # varcard_list = list(map(op.attrgetter('variable_card'), cpd_list))
    # _esdat = list(ut.iprod(*map(range, varcard_list)))
    # _escol = list(map(op.attrgetter('variable'), cpd_list))
    # event_space = pd.DataFrame(_esdat, columns=_escol)

    # # Custom compression of event space to inspect a specific graph
    # def compress_space_flags(event_space, var1, var2, var3, cmp12_):
    #     """
    #     var1, var2, cmp_ = 'Lj', 'Lk', op.eq
    #     """
    #     import vtool as vt
    #     data = event_space
    #     other_cols = ut.setdiff_ordered(data.columns.tolist(), [var1, var2, var3])
    #     case_flags12 = cmp12_(data[var1], data[var2]).values
    #     # case_flags23 = cmp23_(data[var2], data[var3]).values
    #     # case_flags = np.logical_and(case_flags12, case_flags23)
    #     case_flags = case_flags12
    #     case_flags = case_flags.astype(np.int64)
    #     subspace = np.hstack((case_flags[:, None], data[other_cols].values))
    #     sel_ = vt.unique_row_indexes(subspace)
    #     flags = np.logical_and(mask, case_flags)
    #     return flags

    # # Build special cases
    # case_same   = event_space.loc[compress_space_flags(event_space, 'Li', 'Lj', 'Lk', op.eq)]
    # case_diff = event_space.loc[compress_space_flags(event_space, 'Li', 'Lj', 'Lk', op.ne)]
    # special_cases = [
    #     case_same,
    #     case_diff,
    # ]

    from pgmpy.factors import TabularCPD
    from pgmpy.models import BayesianModel
    import pandas as pd
    from pgmpy.inference import BeliefPropagation  # NOQA
    from pgmpy.inference import VariableElimination  # NOQA

    name_nice = ['n1', 'n2', 'n3']
    score_nice = ['low', 'high']
    match_nice = ['diff', 'same']
    num_names = len(name_nice)
    num_scores = len(score_nice)
    nid_basis = list(range(num_names))
    score_basis = list(range(num_scores))

    semtype2_nice = {
        'score': score_nice,
        'name': name_nice,
        'match': match_nice,
    }
    var2_cpd = {
    }
    globals()['semtype2_nice'] = semtype2_nice
    globals()['var2_cpd'] = var2_cpd

    name_combo = np.array(list(ut.iprod(nid_basis, nid_basis)))
    combo_is_same = name_combo.T[0] == name_combo.T[1]
    def get_expected_scores_prob(level1, level2):
        part1 = combo_is_same * level1
        part2 = (1 - combo_is_same) * (1 - (level2))
        expected_scores_level = part1 + part2
        return expected_scores_level

    # def make_cpd():

    def name_cpd(aid):
        from pgmpy.factors import TabularCPD
        cpd = TabularCPD(
            variable='N' + aid,
            variable_card=num_names,
            values=[[1.0 / num_names] * num_names])
        cpd.semtype = 'name'
        return cpd

    name_cpds = [name_cpd('i'), name_cpd('j'), name_cpd('k')]
    var2_cpd.update(dict(zip([cpd.variable for cpd in name_cpds], name_cpds)))
    if True:
        num_same_diff = 2
        samediff_measure = np.array([
            # get_expected_scores_prob(.12, .2),
            # get_expected_scores_prob(.88, .8),
            get_expected_scores_prob(0, 0),
            get_expected_scores_prob(1, 1),
        ])
        samediff_vals = (samediff_measure / samediff_measure.sum(axis=0)).tolist()
        def samediff_cpd(aid1, aid2):
            cpd = TabularCPD(
                variable='A' + aid1 + aid2,
                variable_card=num_same_diff,
                values=samediff_vals,
                evidence=['N' + aid1, 'N' + aid2],  # [::-1],
                evidence_card=[num_names, num_names])  # [::-1])
            cpd.semtype = 'match'
            return cpd
        samediff_cpds = [samediff_cpd('i', 'j'), samediff_cpd('j', 'k'), samediff_cpd('k', 'i')]
        var2_cpd.update(dict(zip([cpd.variable for cpd in samediff_cpds], samediff_cpds)))

        if True:
            def score_cpd(aid1, aid2):
                semtype = 'score'
                evidence = ['A' + aid1 + aid2, 'N' + aid1, 'N' + aid2]
                evidence_cpds = [var2_cpd[key] for key in evidence]
                evidence_nice = [semtype2_nice[cpd.semtype] for cpd in evidence_cpds]
                evidence_card = list(map(len, evidence_nice))
                evidence_states = list(ut.iprod(*evidence_nice))
                variable_basis = semtype2_nice[semtype]

                variable_values = []
                for mystate in variable_basis:
                    row = []
                    for state in evidence_states:
                        if state[0] == state[1]:
                            if state[2] == 'same':
                                val = .2 if mystate == 'low' else .8
                            else:
                                val = 1
                                # val = .5 if mystate == 'low' else .5
                        elif state[0] != state[1]:
                            if state[2] == 'same':
                                val = .5 if mystate == 'low' else .5
                            else:
                                val = 1
                                # val = .9 if mystate == 'low' else .1
                        row.append(val)
                    variable_values.append(row)

                cpd = TabularCPD(
                    variable='S' + aid1 + aid2,
                    variable_card=len(variable_basis),
                    values=variable_values,
                    evidence=evidence,  # [::-1],
                    evidence_card=evidence_card)  # [::-1])
                cpd.semtype = semtype
                return cpd
        else:
            score_values = [
                [.8, .1],
                [.2, .9],
            ]
            def score_cpd(aid1, aid2):
                cpd = TabularCPD(
                    variable='S' + aid1 + aid2,
                    variable_card=num_scores,
                    values=score_values,
                    evidence=['A' + aid1 + aid2],  # [::-1],
                    evidence_card=[num_same_diff])  # [::-1])
                cpd.semtype = 'score'
                return cpd

        score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')]
        cpd_list = name_cpds + score_cpds + samediff_cpds
    else:
        score_measure = np.array([get_expected_scores_prob(level1, level2)
                                  for level1, level2 in
                                  zip(np.linspace(.1, .9, num_scores),
                                      np.linspace(.2, .8, num_scores))])

        score_values = (score_measure / score_measure.sum(axis=0)).tolist()

        def score_cpd(aid1, aid2):
            cpd = TabularCPD(
                variable='S' + aid1 + aid2,
                variable_card=num_scores,
                values=score_values,
                evidence=['N' + aid1, 'N' + aid2],
                evidence_card=[num_names, num_names])
            cpd.semtype = 'score'
            return cpd
        score_cpds = [score_cpd('i', 'j'), score_cpd('j', 'k')]
        cpd_list = name_cpds + score_cpds
        pass

    input_graph = []
    for cpd in cpd_list:
        if cpd.evidence is not None:
            for evar in cpd.evidence:
                input_graph.append((evar, cpd.variable))
    name_model = BayesianModel(input_graph)
    name_model.add_cpds(*cpd_list)

    var2_cpd.update(dict(zip([cpd.variable for cpd in cpd_list], cpd_list)))
    globals()['var2_cpd'] = var2_cpd

    varnames = [cpd.variable for cpd in cpd_list]

    # --- PRINT CPDS ---

    cpd = score_cpds[0]
    def print_cpd(cpd):
        print('CPT: %r' % (cpd,))
        index = semtype2_nice[cpd.semtype]
        if cpd.evidence is None:
            columns = ['None']
        else:
            basis_lists = [semtype2_nice[var2_cpd[ename].semtype] for ename in cpd.evidence]
            columns = [','.join(x) for x in ut.iprod(*basis_lists)]
        data = cpd.get_cpd()
        print(pd.DataFrame(data, index=index, columns=columns))

    for cpd in name_model.get_cpds():
        print('----')
        print(cpd._str('phi'))
        print_cpd(cpd)

    # --- INFERENCE ---

    Ni = name_cpds[0]

    event_space_combos = {}
    event_space_combos[Ni.variable] = 0  # Set ni to always be Fred
    for cpd in cpd_list:
        if cpd.semtype == 'score':
            event_space_combos[cpd.variable] = list(range(cpd.variable_card))
    evidence_dict = ut.all_dict_combinations(event_space_combos)

    # Query about name of annotation k given different event space params

    def pretty_evidence(evidence):
        return [key + '=' + str(semtype2_nice[var2_cpd[key].semtype][val])
                for key, val in evidence.items()]

    def print_factor(factor):
        row_cards = factor.cardinality
        row_vars = factor.variables
        values = factor.values.reshape(np.prod(row_cards), 1).flatten()
        # col_cards = 1
        # col_vars = ['']
        basis_lists = list(zip(*list(ut.iprod(*[range(c) for c in row_cards]))))
        nice_basis_lists = []
        for varname, basis in zip(row_vars, basis_lists):
            cpd = var2_cpd[varname]
            _nice_basis = ut.take(semtype2_nice[cpd.semtype], basis)
            nice_basis = ['%s=%s' % (varname, val) for val in _nice_basis]
            nice_basis_lists.append(nice_basis)
        row_lbls = [', '.join(sorted(x)) for x in zip(*nice_basis_lists)]
        print(ut.repr3(dict(zip(row_lbls, values)), precision=3, align=True, key_order_metric='-val'))

    # name_belief = BeliefPropagation(name_model)
    name_belief = VariableElimination(name_model)
    import pgmpy
    import six  # NOQA

    def try_query(evidence):
        print('--------')
        query_vars = ut.setdiff_ordered(varnames, list(evidence.keys()))
        evidence_str = ', '.join(pretty_evidence(evidence))
        probs = name_belief.query(query_vars, evidence)
        factor_list = probs.values()
        joint_factor = pgmpy.factors.factor_product(*factor_list)
        print('P(' + ', '.join(query_vars) + ' | ' + evidence_str + ')')
        # print(six.text_type(joint_factor))
        factor = joint_factor  # NOQA
        # print_factor(factor)
        # import utool as ut
        print(ut.hz_str([(f._str(phi_or_p='phi')) for f in factor_list]))

    for evidence in evidence_dict:
        try_query(evidence)

    evidence = {'Aij': 1, 'Ajk': 1, 'Aki': 1, 'Ni': 0}
    try_query(evidence)

    evidence = {'Aij': 0, 'Ajk': 0, 'Aki': 0, 'Ni': 0}
    try_query(evidence)

    globals()['score_nice'] = score_nice
    globals()['name_nice'] = name_nice
    globals()['score_basis'] = score_basis
    globals()['nid_basis'] = nid_basis

    print('Independencies')
    print(name_model.get_independencies())
    print(name_model.local_independencies([Ni.variable]))

    # name_belief = BeliefPropagation(name_model)
    # # name_belief = VariableElimination(name_model)
    # for case in special_cases:
    #     test_data = case.drop('Lk', axis=1)
    #     test_data = test_data.reset_index(drop=True)
    #     print('----')
    #     for i in range(test_data.shape[0]):
    #         evidence = test_data.loc[i].to_dict()
    #         probs = name_belief.query(['Lk'], evidence)
    #         factor = probs['Lk']
    #         probs = factor.values
    #         evidence_ = evidence.copy()
    #         evidence_['Li'] = name_nice[evidence['Li']]
    #         evidence_['Lj'] = name_nice[evidence['Lj']]
    #         evidence_['Sij'] = score_nice[evidence['Sij']]
    #         evidence_['Sjk'] = score_nice[evidence['Sjk']]
    #         nice2_prob = ut.odict(zip(name_nice, probs.tolist()))
    #         ut.print_python_code('P(Lk | {evidence}) = {cpt}'.format(
    #             evidence=(ut.repr2(evidence_, explicit=True, nobraces=True, strvals=True)),
    #             cpt=ut.repr3(nice2_prob, precision=3, align=True, key_order_metric='-val')
    #         ))

    # for case in special_cases:
    #     test_data = case.drop('Lk', axis=1)
    #     test_data = test_data.drop('Lj', axis=1)
    #     test_data = test_data.reset_index(drop=True)
    #     print('----')
    #     for i in range(test_data.shape[0]):
    #         evidence = test_data.loc[i].to_dict()
    #         query_vars = ['Lk', 'Lj']
    #         probs = name_belief.query(query_vars, evidence)
    #         for queryvar in query_vars:
    #             factor = probs[queryvar]
    #             print(factor._str('phi'))
    #             probs = factor.values
    #             evidence_ = evidence.copy()
    #             evidence_['Li'] = name_nice[evidence['Li']]
    #             evidence_['Sij'] = score_nice[evidence['Sij']]
    #             evidence_['Sjk'] = score_nice[evidence['Sjk']]
    #             nice2_prob = ut.odict(zip([queryvar + '=' + x for x in name_nice], probs.tolist()))
    #             ut.print_python_code('P({queryvar} | {evidence}) = {cpt}'.format(
    #                 query_var=query_var,
    #                 evidence=(ut.repr2(evidence_, explicit=True, nobraces=True, strvals=True)),
    #                 cpt=ut.repr3(nice2_prob, precision=3, align=True, key_order_metric='-val')
    #             ))

    # _ draw model

    import plottool as pt
    import networkx as netx
    fig = pt.figure()  # NOQA
    fig.clf()
    ax = pt.gca()

    netx_nodes = [(node, {}) for node in name_model.nodes()]
    netx_edges = [(etup[0], etup[1], {}) for etup in name_model.edges()]
    netx_graph = netx.DiGraph()
    netx_graph.add_nodes_from(netx_nodes)
    netx_graph.add_edges_from(netx_edges)

    # pos = netx.graphviz_layout(netx_graph)
    pos = netx.pydot_layout(netx_graph, prog='dot')
    netx.draw(netx_graph, pos=pos, ax=ax, with_labels=True)

    pt.plt.savefig('foo.png')
    ut.startfile('foo.png')
Example #21
class TestInferenceBase(unittest.TestCase):
    def setUp(self):
        self.bayesian = BayesianModel([('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')])
        a_cpd = TabularCPD('a', 2, [[0.4, 0.6]])
        b_cpd = TabularCPD('b', 2, [[0.2, 0.4], [0.3, 0.4]], evidence='a',
                           evidence_card=[2])
        c_cpd = TabularCPD('c', 2, [[0.1, 0.2], [0.3, 0.4]], evidence='b',
                           evidence_card=[2])
        d_cpd = TabularCPD('d', 2, [[0.4, 0.3], [0.2, 0.1]], evidence='c',
                           evidence_card=[2])
        e_cpd = TabularCPD('e', 2, [[0.3, 0.2], [0.4, 0.1]], evidence='d',
                           evidence_card=[2])
        self.bayesian.add_cpds(a_cpd, b_cpd, c_cpd, d_cpd, e_cpd)

        self.markov = MarkovModel([('a', 'b'), ('b', 'd'), ('a', 'c'), ('c', 'd')])
        factor_1 = Factor(['a', 'b'], [2, 2], np.array([100, 1, 1, 100]))
        factor_2 = Factor(['a', 'c'], [2, 2], np.array([40, 30, 100, 20]))
        factor_3 = Factor(['b', 'd'], [2, 2], np.array([1, 100, 100, 1]))
        factor_4 = Factor(['c', 'd'], [2, 2], np.array([60, 60, 40, 40]))
        self.markov.add_factors(factor_1, factor_2, factor_3, factor_4)

    def test_bayesian_inference_init(self):
        infer_bayesian = Inference(self.bayesian)
        self.assertEqual(set(infer_bayesian.variables), {'a', 'b', 'c', 'd', 'e'})
        self.assertEqual(infer_bayesian.cardinality, {'a': 2, 'b': 2, 'c': 2,
                                                      'd': 2, 'e': 2})
        self.assertIsInstance(infer_bayesian.factors, defaultdict)
        self.assertEqual(set(infer_bayesian.factors['a']),
                         set([self.bayesian.get_cpds('a').to_factor(),
                              self.bayesian.get_cpds('b').to_factor()]))
        self.assertEqual(set(infer_bayesian.factors['b']),
                         set([self.bayesian.get_cpds('b').to_factor(),
                              self.bayesian.get_cpds('c').to_factor()]))
        self.assertEqual(set(infer_bayesian.factors['c']),
                         set([self.bayesian.get_cpds('c').to_factor(),
                              self.bayesian.get_cpds('d').to_factor()]))
        self.assertEqual(set(infer_bayesian.factors['d']),
                         set([self.bayesian.get_cpds('d').to_factor(),
                              self.bayesian.get_cpds('e').to_factor()]))
        self.assertEqual(set(infer_bayesian.factors['e']),
                         set([self.bayesian.get_cpds('e').to_factor()]))

    def test_markov_inference_init(self):
        infer_markov = Inference(self.markov)
        self.assertEqual(set(infer_markov.variables), {'a', 'b', 'c', 'd'})
        self.assertEqual(infer_markov.cardinality, {'a': 2, 'b': 2, 'c': 2, 'd': 2})
        self.assertEqual(infer_markov.factors, {'a': [Factor(['a', 'b'], [2, 2],
                                                             np.array([100, 1, 1, 100])),
                                                      Factor(['a', 'c'], [2, 2],
                                                             np.array([40, 30, 100, 20]))],
                                                'b': [Factor(['a', 'b'], [2, 2],
                                                             np.array([100, 1, 1, 100])),
                                                      Factor(['b', 'd'], [2, 2],
                                                             np.array([1, 100, 100, 1]))],
                                                'c': [Factor(['a', 'c'], [2, 2],
                                                             np.array([40, 30, 100, 20])),
                                                      Factor(['c', 'd'], [2, 2],
                                                             np.array([60, 60, 40, 40]))],
                                                'd': [Factor(['b', 'd'], [2, 2],
                                                             np.array([1, 100, 100, 1])),
                                                      Factor(['c', 'd'], [2, 2],
                                                             np.array([60, 60, 40, 40]))]})
Example #22
        [0.3, 0.05, 0.9, 0.5],  # the probability table of this node
        [0.4, 0.25, 0.08, 0.3],
        [0.3, 0.7, 0.02, 0.2]
    ],
    evidence=["I", "D"],  # the nodes this node depends on
    evidence_card=[2, 2]  # the number of states of each evidence node
)
drug_cpd = TabularCPD(variable="L",
                      variable_card=2,
                      values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]],
                      evidence=["G"],
                      evidence_card=[3])
toxicity_cpd = TabularCPD(variable="S",
                          variable_card=2,
                          values=[[0.95, 0.2], [0.05, 0.8]],
                          evidence=["I"],
                          evidence_card=[2])

Chemoinformatics_model.add_cpds(active_cpd, amino_cpd, benzene_cpd, drug_cpd,
                                toxicity_cpd)

Chemoinformatics_model.get_cpds()

Chemoinformatics_infer = VariableElimination(Chemoinformatics_model)
prob = Chemoinformatics_infer.query(variables=["L"],
                                    evidence={
                                        "D": 1,
                                        "I": 1,
                                        "G": 0
                                    })
print(prob)
class TestDirectedGraphCPDOperations(unittest.TestCase):
    def setUp(self):
        self.graph = BayesianModel()

    def test_add_single_cpd(self):
        cpd = TabularCPD('grade', 2, np.random.rand(2, 4),
                         ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd)
        self.assertListEqual(self.graph.get_cpds(), [cpd])

    def test_add_multiple_cpds(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.assertListEqual(self.graph.get_cpds(), [cpd1, cpd2, cpd3])

    def test_remove_single_cpd(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.graph.remove_cpds(cpd1)
        self.assertListEqual(self.graph.get_cpds(), [cpd2, cpd3])

    def test_remove_multiple_cpds(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.graph.remove_cpds(cpd1, cpd3)
        self.assertListEqual(self.graph.get_cpds(), [cpd2])

    def test_remove_single_cpd_string(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.graph.remove_cpds('diff')
        self.assertListEqual(self.graph.get_cpds(), [cpd2, cpd3])

    def test_remove_multiple_cpds_string(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.graph.remove_cpds('diff', 'grade')
        self.assertListEqual(self.graph.get_cpds(), [cpd2])

    def test_get_cpd_for_node(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.assertEqual(self.graph.get_cpds('diff'), cpd1)
        self.assertEqual(self.graph.get_cpds('intel'), cpd2)
        self.assertEqual(self.graph.get_cpds('grade'), cpd3)

    def test_get_cpd_raises_error(self):
        cpd1 = TabularCPD('diff', 2, np.random.rand(2, 1))
        cpd2 = TabularCPD('intel', 2, np.random.rand(2, 1))
        cpd3 = TabularCPD('grade', 2, np.random.rand(2, 4),
                          ['diff', 'intel'], [2, 2])
        self.graph.add_edges_from([('diff', 'grade'), ('intel', 'grade')])
        self.graph.add_cpds(cpd1, cpd2, cpd3)
        self.assertRaises(ValueError, self.graph.get_cpds, 'sat')

    def tearDown(self):
        del self.graph
Example #24
class AwareEnv(object):
    def __init__(self):
        self.actions = [
            0.0, 0.01, 0.02, 0.03, 0.04, 0.05, -0.01, -0.02, -0.03, -0.04,
            -0.05
        ]
        self.model = BayesianModel([('Consciente', 'DistracaoApp'),
                                    ('Consciente', 'DirecaoCarro'),
                                    ('Consciente', 'SomCarro'),
                                    ('Consciente', 'Percepcao')])
        self.episodes = TEST.copy().drop('Consciente', axis=1)

    def reset(self):
        self.model = BayesianModel([('Consciente', 'DistracaoApp'),
                                    ('Consciente', 'DirecaoCarro'),
                                    ('Consciente', 'SomCarro'),
                                    ('Consciente', 'Percepcao')])
        self.model.fit(TRAIN, estimator=BayesianEstimator)
        aware = [
            node for node in self.model.get_cpds()
            if node.variable == 'Consciente'
        ].pop()
        self.state = [np.round(aware.values, 2)]

        self.cpds = self._tabular_cpds_to_dict(self.model)

        for node in self.model.get_cpds():
            print(node)

        return self.state

    def render(self):
        aware = [
            node for node in self.model.get_cpds()
            if node.variable == 'Consciente'
        ].pop()
        self.state = np.round(aware.values, 2)

        self.cpds = self._tabular_cpds_to_dict(self.model)

    def _tabular_cpds_to_dict(self, model):
        return {
            node.variable: {
                state: value
                for state, value in zip(node.state_names[node.variable],
                                        node.values)
            }
            for node in model.get_cpds()
        }

    def _get_cpd_values(self, node_values):
        cpds = []

        for state, param in node_values.items():
            if type(param) == dict:
                cpds.append(list(param.values()))
            else:
                cpds.append(param)

        return np.array(cpds)

    def step(self, adjustment, episode):
        print('######## Adjustments ########')
        print(adjustment)
        print('######## Current episode ########')
        print(episode)

        replaced_episode = {k: replacer[k][v] for k, v in episode.items()}

        upper_bound = self.state[0] + adjustment
        lower_bound = self.state[1] - adjustment

        if not (upper_bound > 1 or upper_bound < 0):
            state_aware = [upper_bound, lower_bound]

            cpds = self._tabular_cpds_to_dict(self.model)
            adjustments = self.fit_probabilities(cpds, adjustment)
            for node in self.model.get_cpds():
                if node.variable != 'Consciente':
                    node.values = self._get_cpd_values(
                        adjustments[node.variable])
                    node.normalize()
                else:
                    node.values = np.array(state_aware)

            for node in self.model.get_cpds():
                print(node)
        else:
            state_aware = [self.state]

        print('######## Consciente ########')
        bp = BeliefPropagation(self.model)
        print(
            bp.query(['Consciente'], evidence=replaced_episode)['Consciente'])

        reward = float(input('Reward between -1 and 1: '))
        next_state = []
        next_state.append(np.round(state_aware, 2))
        next_state.extend(list(replaced_episode.values()))

        return next_state, reward

    def fit_probabilities(self, cpds, adjustment):
        del cpds['Consciente']

        adjusted_probabilities = {}
        position = int(adjustment < 0)

        for state, param in cpds.items():
            params = list(param.keys())
            param_values = list(param.values())

            new_param_values = []
            npt = np.transpose(param_values)

            for cpd_list, param in zip(npt, params):
                fitting = approximate[state][param] * (adjustment * 100)

                values = []
                for cpd in cpd_list:
                    fit = cpd + fitting

                    if fit < 0:
                        fit = 0
                    elif fit > 1:
                        fit = 1

                    values.append(fit)

                new_param_values.append(self.normalize(values))

            npt = np.transpose(new_param_values)
            adjusted_probabilities[state] = {}

            for i, param in enumerate(params):
                adjusted_probabilities[state][param] = np.array(npt[i])

        return adjusted_probabilities

    def normalize(self, lst):
        s = sum(lst)
        return list(map(lambda x: float(x) / s, lst))
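# A hedged illustration (an addition, not part of the original class): normalize
# simply rescales a non-negative list so it sums to 1, which is how the clipped
# CPD columns produced in fit_probabilities become valid distributions again.
values = [2.0, 6.0, 2.0]
total = sum(values)
print([v / total for v in values])  # -> [0.2, 0.6, 0.2]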
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator
# Generating some random data
raw_data = np.random.randint(low=0, high=2, size=(1000, 6))
print(raw_data)
data = pd.DataFrame(raw_data, columns=['A', 'R', 'J', 'G', 'L', 'Q'])

# Creating the network structures
student_model = BayesianModel([('A', 'J'), ('R', 'J'),
                               ('J', 'Q'), ('J', 'L'),
                               ('G', 'L')])
student_model.fit(data, estimator=BayesianEstimator)
student_model.get_cpds()
print(student_model.get_cpds('J'))  # 'J' is a node of this model; 'D' is not and would raise an error
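# A minimal inference sketch (an addition, not part of the original snippet):
# once the model is fitted, marginals can be queried with VariableElimination.
# The node names are the ones defined above; the evidence values are illustrative.
from pgmpy.inference import VariableElimination

infer = VariableElimination(student_model)
print(infer.query(variables=['J'], evidence={'A': 0, 'R': 1}))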
Exemple #26
0
    for a, b in edges:
        dot.edge(a, b)
    if save:
        dot.view(cleanup=True)
    return dot

predict_data = test.drop(columns=['scene'])
# re=pd.read_csv('./re.txt')
# print(re.info())
# print(predict_data.info())
print("预测数据集")
print(predict_data)
y_pred = model.predict(predict_data)
showBN(model)
print("预测结果")
print(y_pred)
# 预测结果

print("节点条件概率情况")
print(model.get_cpds())
# 各个节点条件概率情况
# re['doors'] = re['doors'].astype('object')

# print(model.predict_probability(re))
# predicted probabilities
print("Prediction accuracy")
print((y_pred['scene']==test['scene']).sum()/len(test))
end = time.process_time()
print("Total running time:")
print('Running time: %s Seconds' % (end - start))
# accuracy
cpd_smellalcohol = TabularCPD(variable='SA',  # reconstructed header: the listing was truncated here; 'SA' is an assumption
                          variable_card=2,
                          values=[[0.900, 0.200], [0.100, 0.800]],
                          evidence=['HO'],
                          evidence_card=[2])
cpd_posxray = TabularCPD(variable='PX',
                          variable_card=2,
                          values=[[0.990, 0.020], [0.010, 0.980]],
                          evidence=['BT'],
                          evidence_card=[2])
cpd_headache = TabularCPD(variable='HA',
                        variable_card=2,
                        values=[[0.980, 0.100, 0.300, 0.010], [0.020, 0.900, 0.700, 0.990] ],
                        evidence=['HO', 'BT'],
                        evidence_card=[2, 2])

cancer_model.add_cpds(cpd_party, cpd_braintumor, cpd_hangover, cpd_smellalcohol, cpd_posxray, cpd_headache)
for cp in cancer_model.get_cpds():
    print(cp)

# predict the probability that HA occurs
from pgmpy.inference import VariableElimination
cancer_infer = VariableElimination(cancer_model)
q = cancer_infer.query(variables=['HA'])
print(q)
# diagnose the probability of BT given certain evidence
#from pgmpy.inference import VariableElimination
cancer_infer = VariableElimination(cancer_model)
q = cancer_infer.query(variables=['BT'], evidence={'PX': 1})
print(q)
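# A hedged extension (not in the original snippet): VariableElimination also offers
# map_query, which returns the single most likely assignment instead of a posterior.
print(cancer_infer.map_query(variables=['BT'], evidence={'PX': 1}))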


Exemple #28
0
class Utilities(object):
    def __init__(self, file):
        ''' author note: is creating an object here opportune? '''
        self.keywords = ['BENS', 'MEMS', 'LANS', 'MOTOR', 'WORLD']
        self.standard_nodes = {
            'RONS': {
                'BENS': [],
                'MEMS': []
            },
            'LANS': {
                'LANS': []
            },
            'LENS': {
                'MOTOR': [],
                'WORLD': []
            }
        }
        self.file = file
        self.get_json_path(file)
        self.pgmpy_object = BayesianModel()
        self.networkx_object = nx.DiGraph()
        self.header = ''
        self.dictionary = []

    def get_nodes_in_family(self, family, attributes=False):
        nw_nodes = self.networkx_object.nodes()
        nw_dim = np.asarray(nw_nodes).ndim
        nodes = []
        for i, node in enumerate(nw_nodes):
            if nw_dim > 1:
                node = node[0]
            if family in node:
                nodes.append(node)
        return nodes

    @staticmethod
    def check_json_path(directory):
        """
        Checks whether the necessary project_repository directory exists.
        If not, creates it

        :param directory: the parent directory to search from downwards

        :type directory: string
        :rtype : none
        """
        path = os.path.join(directory, 'project_repository')
        if not os.path.exists(path):
            os.makedirs(path)

    def get_json_path(self, file):
        """
        Creates a string containing the full path for the filename passed
        so it will be saved in the project_repository directory

        :param filename: filename without path or extension
        :return: a full path for the file

        :type filename :string
        :rtype : string
        """
        levels = 5
        common = os.path.dirname(os.path.realpath(__file__))
        for i in range(levels + 1):
            common = os.path.dirname(common)
            if 'peepo\peepo' not in common:
                break
        Utilities.check_json_path(common)
        self.file = str(common + '\project_repository\\' + file + '.json')
        print('in get_json_path :', self.file)

    def save_json(self, astring):
        """
        This helping function is only needed to have the json file  formatted in a user friendly way
        as the "dump" method does not provide a lot of possibilities to get it "pretty"

        :param file :the ull path of the json file
        :param astring: the name of the string containing the whole information
        :return: void
        :type file: string
        :type astring : string
        :rtype : void
        """
        text_file = open(str(self.file), "w")
        '''remove all LF written by the dump method'''
        astring = re.sub('\n', '', astring)
        '''For keywords -> insert LF and tabs'''
        astring = re.sub('\"Identification', '\n\"Identification', astring)
        astring = re.sub('\"Date', '\n\"Date', astring)
        astring = re.sub('\"Description', '\n\"Description', astring)
        astring = re.sub('\"Train_from', '\n\"Train_from', astring)
        astring = re.sub('\"Frozen', '\n\"Frozen', astring)
        astring = re.sub('\"Nodes', '\n\n\"Nodes', astring)
        astring = re.sub('\"RONS', '\n\t\t\"RONS', astring)
        astring = re.sub('\"BENS', '\n\t\t\t\"BENS', astring)
        astring = re.sub('\"MEMS', '\n\t\t\t\"MEMS', astring)
        astring = re.sub('\"LANS', '\n\t\t\"LANS', astring)
        astring = re.sub('\"LENS', '\n\t\t\"LENS', astring)
        astring = re.sub('\"MOTOR', '\n\t\t\t\"MOTOR', astring)
        astring = re.sub('\"WORLD', '\n\t\t\t\"WORLD', astring)
        astring = re.sub('\"Edges', '\n\n\"Edges', astring)
        astring = re.sub('\"CPDs', '\n\n\"CPDs', astring)
        astring = re.sub('{', '\n\t\t{', astring)
        text_file.write(astring)
        text_file.write('\n')
        text_file.close()

    def translation(self, astring, from_man_to_machine):
        """
        Given an array of tuples (a,b) in dictionary, returns the second element of the tuple where astring was found
        Is used to not loose the users node names as peepo generates standardized names for the corresponding node

        :param dictionary:an array of tuples -> is created in the method : get_network(file)
        :param astring: the name of the node passsed by the user
        :param from_man_to_machine: an integer -> 0 when we want the translation for the user give name to the standardized name, 1 the other way around
        :return: the corresponding standardized node name
        :type dictionary: np.array
        :type astring : string
        :rtype : string
        """
        source = 0
        target = 1
        if from_man_to_machine == 1:
            source = 1
            target = 0

        for index, item in enumerate(self.dictionary):
            if item[source] == astring:
                break
        return item[target]

    def clean_edge_list(self, edge_array, parent):
        '''the edge getters in both networkx and pgmpy include the parent name;
            this function removes it from the list'''
        cleaned_list = []
        for a in edge_array:
            if a != parent:
                cleaned_list.append(a)
        return cleaned_list

    def clean_parent_list(self, parent_array, child):
        '''the edge getters in both networkx and pgmpy include the parent name;
            this function removes it from the list'''
        cleaned_list = []
        for i, a in enumerate(parent_array):
            if a[0] != child:
                cleaned_list.append(a[0])
        return cleaned_list

    def get_edges(self):
        """
        Creates a dictionary with a node as a key and an array with its child as value
        (the methods get_child give generally a list of tuples (parent,child)

        :param  pgmpy_object: the pgmpy network
        :return: a dictionary with the edges of all the node

        :type fpgmpy_object:adress
        :rtype :dictionary
                """
        edg = self.pgmpy_object.edges()
        edges = dict()
        [
            edges[str(t[0])].append(str(t[1])) if t[0] in list(edges.keys())
            else edges.update({str(t[0]): [str(t[1])]}) for t in edg
        ]
        return edges

    def get_nodes_and_attributes(self):
        """
        Creates an  array  of tuple with a node as element 0 and a dictionary with cardinalities and cpd as key's and
         the key cardinality returns an int
         the key cpd a 2 dimensional matrix

        :param  pgmpy_object: the pgmpy network
        :return: array  of tuple with a node as element 0 and a dictionary with cardinalities and cpd as key's

        :type  :pgmpy_object:adress
        :rtype :array of tuples
        """
        nodes = self.pgmpy_object.nodes()
        nod_and_attributes = []
        [
            nod_and_attributes.append((str(node), {
                'cardinality':
                int(self.pgmpy_object.get_cardinality(node)),
                'cpd':
                self.pgmpy_object.get_cpds(node).values.astype(float)
            })) for i, node in enumerate(nodes)
        ]
        #need to reshape the cpds when more than 1 parent
        for i, node in enumerate(nod_and_attributes):
            shape = nod_and_attributes[i][1]['cpd'].shape
            dimension = nod_and_attributes[i][1]['cpd'].ndim
            if dimension > 2:
                col = int(np.prod(shape) / shape[0])
                nod_and_attributes[i][1]['cpd'] = nod_and_attributes[i][1][
                    'cpd'].reshape(shape[0], col)
            nod_and_attributes[i][1]['cpd'] = nod_and_attributes[i][1][
                'cpd'].tolist()
        return nod_and_attributes

    def translate_pgmpy_to_digraph(self):
        """
        Converts a pgmpy network into a networkx network

        :param  pgmpy_object: the pgmpy network
        :return networkx : networkx network

        :type  :pgmpy_object:adress
        :rtype :networkx:adress
        """
        self.networkx_object = nx.DiGraph()
        edges = self.pgmpy_object.edges()
        nodes_and_attributes = self.get_nodes_and_attributes()
        self.networkx_object.add_nodes_from(nodes_and_attributes)
        self.networkx_object.add_edges_from(edges)
        return

    def update_networkx(self, networkx, dic, header):
        self.header = header
        self.dictionary = dic
        self.networkx_object = networkx

    def update_pgmpy(self, pgmpy, dic, header):
        self.header = header
        self.dictionary = dic
        self.pgmpy_object = pgmpy

    def save_pgmpy_network(self):
        """
                Saves the passed pgmpy_object class object in a json file
        """
        self.translate_pgmpy_to_digraph()
        self.save_network()
        return

    def translate_digraph_to_pgmpy(self, digraf):
        """
        Converts a pgmpy network into a networkx network

        :param  pgmpy_object: the pgmpy network
        :return networkx : networkx network

        :type  :pgmpy_object:adress
        :rtype :networkx:adress
        """
        self.pgmpy_object, x, y = self.get_pgmpy_network(from_object=True,
                                                         digraph=digraf)
        return self.pgmpy_object

    def save_network(self):
        """
        Saves the passed networkx class object in a json file

        """
        data = self.get_empty_canvas()
        data["header"] = self.header
        nw_nodes = self.networkx_object.nodes(data=True)
        nw_edges = self.networkx_object.edges()
        keywords = self.keywords
        nodes = copy.deepcopy(
            self.standard_nodes
        )  #{'RONS': {'BENS': [], 'MEMS': []}, 'LANS': {'LANS': []}, 'LENS': {'MOTOR': [], 'WORLD': []}}
        edges = []
        cpds = []
        '''adding edges'''
        for i, node in enumerate(nw_nodes):
            node_name = node[0]
            childs = []
            for k, edge in enumerate(nw_edges):
                if edge[0] == node_name:
                    childs.append(self.translation(edge[1], 1))
            if len(childs) != 0:
                edges.append({self.translation(node_name, 1): childs})

        for i, node in enumerate(nw_nodes):
            node_name = node[0]
            cardinality = node[1]['cardinality']
            cpd = node[1]['cpd']
            for pseudonym in keywords:
                if pseudonym in node_name:
                    node_name_ = self.translation(node_name, 1)
                    if pseudonym == 'BENS' or pseudonym == 'MEMS':
                        nodes['RONS'][pseudonym].append(
                            [node_name_, cardinality])
                    if pseudonym == 'LANS':
                        nodes['LANS'][pseudonym].append(
                            [node_name_, cardinality])
                    if pseudonym == 'MOTOR' or pseudonym == 'WORLD':
                        nodes['LENS'][pseudonym].append(
                            [node_name_, cardinality])
            cpds.append({self.translation(node_name, 1): cpd})
        data['Nodes'] = nodes
        data['Edges'] = edges
        data['CPDs'] = cpds
        data['header']['Date'] = datetime.datetime.now().strftime("%c")
        self.save_json(json.dumps(data))
        return

    def get_pgmpy_network(self, from_object=False, digraph=None):
        """
        Reads the passed json file and translates it's content to the passed pgmpy class object
        - uses the get_network(file) to read the json file in a networkx format and translate this to pgmpy
        - Creates a dictionary for the nodes in the form of an array of tuples : [(names defines by user, standard name)]

        :param file: : filename without path or extension
        :pgmp_object : the pgmpy object which will be completed
        :return: a dictionary as an array of tuples and the header of the json file

        :type file : string
        :type pgmp_object : pgmpy class object
        :rtype : array of tuples, dictionary

        CAUTION : the method does not perform a check() on the constructed DAG ! -> has to be done in the calling module
        """
        self.pgmpy_object = BayesianModel()
        if not (from_object):
            network, dictionary, header = self.get_network()
        else:
            network = digraph
        nw_nodes = network.nodes(data=True)
        nw_edges = network.edges()
        '''adding nodes and edges'''
        for i, node in enumerate(nw_nodes):
            node_name = node[0]
            self.pgmpy_object.add_node(node_name)
            for k, edge in enumerate(nw_edges):
                if edge[0] == node_name:
                    self.pgmpy_object.add_edge(node_name, edge[1])
        '''add  cpd's'''
        for i, node in enumerate(nw_nodes):
            parent_nodes = network.in_edges(node[0])
            parent_nodes = self.clean_parent_list(parent_nodes, node[0])
            cpd = node[1]['cpd']
            ''' find the cardinality of the node '''
            cardinality_node = node[1]['cardinality']
            """  cardinality card of parents has to be determined"""
            cardinality_parents = []
            for i, nod in enumerate(parent_nodes):
                cardinality_parents.append(network.node[nod]['cardinality'])
            ''' Depending on the place in the BN and/or the number of parents, the pgmpy CPD constructor is called differently '''
            if len(cardinality_parents) == 0:
                self.pgmpy_object.add_cpds(
                    TabularCPD(variable=node[0],
                               variable_card=cardinality_node,
                               values=[cpd]))
                continue
            table = TabularCPD(variable=node[0], variable_card= cardinality_node, values=cpd, \
                              evidence=parent_nodes,\
                              evidence_card=np.asarray(cardinality_parents))
            self.pgmpy_object.add_cpds(table)
        return self.pgmpy_object, self.dictionary, self.header

    def get_network(self):
        """
        Reads the passed json file and translate it's content in a networkx class object
        - The nodes in the object are renamed so they have a standardized signature
        - Creates a dictionary for the nodes in the form of an array of tuples : [(names defines by user, standard name)]

        :param file: : filename without path or extension
        :return: a networkx class object, dictionary as an array of tuples and the header of the json file

        :type file : string
        :rtype : networkx class object, array of tuples, dictionary
        """
        self.dictionary = []
        self.networkx_object = nx.DiGraph()
        with open(self.file) as f:
            data = f.read()
        '''Remove possible non informative characters'''
        data = re.sub('\n', '', data)
        data = re.sub('\t', '', data)
        data = json.loads(data)
        self.header = data['header']
        '''Feeding G with the nodes'''
        cardinality = {}
        for key in data['Nodes'].keys():
            for secondkey in data['Nodes'][key].keys():
                for c, n in enumerate(data['Nodes'][key][secondkey]):
                    node = secondkey + "_" + str(c)
                    self.networkx_object.add_node(node, {
                        'cardinality': n[1],
                        'cpd': []
                    })
                    self.dictionary.append((n[0], node))
                    cardinality.update(
                        {node: n[1]}
                    )  #this contains the cardinality of each node with the node name as dictionary entry
        '''Feeding G with the edges'''
        edges = []
        for j, pair in enumerate(data['Edges']):
            for parent in pair.keys():
                for child in data['Edges'][j][parent]:
                    parent_ = self.translation(parent, 0)
                    child_ = self.translation(child, 0)
                    edges.append((parent_, child_))
        self.networkx_object.add_edges_from(edges)
        '''Feeding G with the  CPD's as nodes attributes'''
        for j, node in enumerate(data['CPDs']):
            for parent, cpd in node.items():
                node_ = self.translation(parent, 0)
                self.networkx_object.node[node_]['cpd'] = cpd
        return self.networkx_object, self.dictionary, self.header

    def create_json_file(self, **kwargs):
        """
        EXAMPLE :

        A helping method if the user prefers to create the BN within the code

        :param case_name: the file name without path or extension where the json file will be saved
        :param : **kwargs takes the following variables:
                                                            description = kwargs.get('description', '')
                                                            train_from = kwargs.get('train_from', '')
                                                            cpds = kwargs.get('CPDs', [])
                                                            bens = kwargs.get('BENS',[])
                                                            mems = kwargs.get('MEMS', [])
                                                            lans = kwargs.get('LANS', [])
                                                            motors = kwargs.get('MOTORS', [])
                                                            world = kwargs.get('WORLD', [])
                                                            edges = kwargs.get('Edges', [])
                                                            frozen = kwargs.get('frozen',False)
        .
        .
        .
        :return: void

        :type case_name : string
        :type  :
        .
        .
        .
        :rtype : void
        """
        description = kwargs.get('description', '')
        train_from = kwargs.get('train_from', '')
        cpds = kwargs.get('CPDs', [])
        bens = kwargs.get('BENS', [])
        mems = kwargs.get('MEMS', [])
        lans = kwargs.get('LANS', [])
        motors = kwargs.get('MOTORS', [])
        world = kwargs.get('WORLD', [])
        edges = kwargs.get('Edges', [])
        frozen = kwargs.get('frozen', False)

        data = self.get_empty_canvas()
        '''       - the next 3 items are for tracking purposes only, not fundamentally necessary'''
        data["header"]['Identification'] = self.file
        data["header"]['Date'] = datetime.datetime.now().strftime("%c")
        data["header"]['Description'] = description
        '''       - the next item gives a file containing possible training data (OPTIONAL)'''
        data["header"]['Train_from'] = train_from
        '''      Frozen tells whether or not the model can be considered as final i.e. is there still "training" needed'''
        data["header"]['Frozen'] = frozen
        '''       - the next 5 lines tell how many nodes (with their names + cardinality) the model will start with;
                    the names can be any valid python string'''
        bens = [['pooping', 2], ['peeing', 2], ['constipated', 2]]
        mems = [['havenotoiletpaper', 2]]
        lans = [['diarhea', 2], ['happypoop', 2]]
        motors = [['asshole1', 2], ['asshole2', 2]]
        world = [['toilet1', 2], ['toilet2', 2], ['garden1', 2],
                 ['garden2', 2], ['doctor', 2]]
        '''     - the next items describe the edges as a dictionary
                 -> the dictionary entry is always one of the rootnodes, the array following can only contain LANs or LENs'''
        edges = []
        '''       !! in case we start from scratch and we rely on peepo to find the best BN -> leave this array empty'''
        edges.append({'pooping': ['toilet1', 'diarhea', 'happypoop']})
        edges.append({'peeing': ['toilet2', 'garden1', 'garden2']})
        edges.append({'constipated': ['doctor']})
        edges.append({'havenotoiletpaper': ['garden1', 'garden2']})
        edges.append(
            {'diarhea': ['toilet1', 'doctor', 'asshole1', 'asshole2']})
        edges.append(
            {'happypoop': ['garden1', 'garden2', 'asshole1', 'asshole2']})
        '''       - the next items describe the CPD's  as a dictionary
                  -> the dictionary entry is the corresponding node'''
        cpds = []
        cpds.append({'pooping': [0.5, 0.5]})
        cpds.append({'peeing': [0.2, 0.8]})
        cpds.append({'constipated': [0.9, 0.1]})
        cpds.append({'havenotoiletpaper': [0.6, 0.4]})
        cpds.append({'happypoop': [[0.3, 0.8], [0.7, 0.2]]})
        cpds.append({'diarhea': [[0.8, 0.3], [0.2, 0.7]]})
        cpds.append({'toilet1': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]})
        cpds.append({'asshole1': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]})
        cpds.append({'asshole2': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]})
        cpds.append({'toilet2': [[0.5, 0.5], [0.5, 0.5]]})
        cpds.append({'doctor': [[0.3, 0.8, 0.8, 0.7], [0.7, 0.2, 0.2, 0.3]]})
        cpds.append({
            'garden1': [[0.3, 0.8, 0.8, 0.7, 0.8, 0.2, 0.5, 0.5],
                        [0.7, 0.2, 0.2, 0.3, 0.2, 0.8, 0.5, 0.5]]
        })
        cpds.append({
            'garden2': [[0.3, 0.8, 0.8, 0.7, 0.8, 0.2, 0.5, 0.5],
                        [0.7, 0.2, 0.2, 0.3, 0.2, 0.8, 0.5, 0.5]]
        })
        '''       - feeding the data'''
        data["Nodes"]['RONS']['BENS'] = bens
        data["Nodes"]['RONS']['MEMS'] = mems
        data["Nodes"]['LANS']['LANS'] = lans
        data["Nodes"]['LENS']['MOTOR'] = motors
        data["Nodes"]['LENS']['WORLD'] = world
        data["Edges"] = edges
        data["CPDs"] = cpds
        ''' dumping to CASENAME file in json format'''
        self.save_json(json.dumps(data))

        print("Json file for  - ", self.file, "  - created")

    def create_json_template(self):
        """
        A helper method in case the json template in the project_repository directory has been deleted or corrupted

        :param : void
        :return: void

        :type : void
        :rtype : void
        """
        self.get_json_path(
            "Template"
        )  # creates the right path in which case_name will be saved
        data = self.get_empty_canvas()
        data['header']['Identification'] = self.file
        '''Filling some dummies to facilitate the user'''
        a_node = ['*', 0]
        an_edge = {'*': ['&', '&', '&']}
        a_cpd = {'*': [[0, 0, 0], [0, 0, 0]]}
        nodes = []
        edges = []
        cpds = []
        for i in range(0, 3):
            nodes.append(a_node)
            edges.append(an_edge)
            cpds.append(a_cpd)

        data['Nodes']['RONS']['BENS'] = nodes
        data['Nodes']['RONS']['MEMS'] = nodes
        data['Nodes']['LANS']['LANS'] = nodes
        data['Nodes']['LENS']['MOTOR'] = nodes
        data['Nodes']['LENS']['WORLD'] = nodes
        data['Edges'] = edges
        data['CPDs'] = cpds
        ''' dumping to CASENAME file in json format'''
        # with open(case_name, 'w') as f:
        #     json.dump(data, f, separators = (",",":"))
        self.save_json(json.dumps(data))
        print("Empty template created")

    def get_empty_canvas(self):
        """
         This method creates a json canvas which will be used by the several json-creating methods

         :param : void
         :return: a dictionary with the structure of the json file
         :type : none
         :rtype : dictionary
         """

        data = {
            'header': {
                'Identification': '',
                'Date': '',
                'Description': '',
                'Frozen': '',
                'Train_from': ''
            },
            'Nodes': {},
            'Edges': [],
            'CPDs': []
        }
        '''       - the next 5 lines tell how many nodes (and their names) the model will start with;
                    the names can be any valid python string'''
        bens = []
        mems = []
        lans = []
        motors = []
        world = []
        '''     - the next items describe the edges as a dictionary
                 -> the dictionary entry is always one of the rootnodes, the array following can only contain LANs or LENs

                 !! in case we start from scratch and we rely on peepo to find the best BN -> leave this array empty'''
        edges = []
        '''       - the next items describe the CPD's  as a dictionary
                  -> the dictionary entry is the corresponding node'''
        cpds = []
        '''       - feeding the data'''
        data['Nodes'] = {
            'RONS': {
                'BENS': bens,
                'MEMS': mems
            },
            'LANS': {
                'LANS': lans
            },
            'LENS': {
                'MOTOR': motors,
                'WORLD': world
            }
        }
        data['Edges'] = edges
        data['CPDs'] = cpds
        return data
class TestBayesianModelFitPredict(unittest.TestCase):

    def setUp(self):
        self.model_disconnected = BayesianModel()
        self.model_disconnected.add_nodes_from(['A', 'B', 'C', 'D', 'E'])
        self.model_connected = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])

        self.model2 = BayesianModel([('A', 'C'), ('B', 'C')])
        self.data1 = pd.DataFrame(data={'A': [0, 0, 1], 'B': [0, 1, 0], 'C': [1, 1, 0]})
        self.data2 = pd.DataFrame(data={'A': [0, np.NaN, 1],
                                        'B': [0, 1, 0],
                                        'C': [1, 1, np.NaN],
                                        'D': [np.NaN, 'Y', np.NaN]})

        # data_link - "https://www.kaggle.com/c/titanic/download/train.csv"
        self.titanic_data = pd.read_csv('pgmpy/tests/test_estimators/testdata/titanic_train.csv', dtype=str)
        self.titanic_data2 = self.titanic_data[["Survived", "Sex", "Pclass"]]

    def test_bayesian_fit(self):
        print(isinstance(BayesianEstimator, BaseEstimator))
        print(isinstance(MaximumLikelihoodEstimator, BaseEstimator))
        self.model2.fit(self.data1, estimator=BayesianEstimator, prior_type="dirichlet", pseudo_counts=[9, 3])
        self.assertEqual(self.model2.get_cpds('B'), TabularCPD('B', 2, [[11.0 / 15], [4.0 / 15]]))

    def test_fit_missing_data(self):
        self.model2.fit(self.data2, state_names={'C': [0, 1]}, complete_samples_only=False)
        cpds = set([TabularCPD('A', 2, [[0.5], [0.5]]),
                    TabularCPD('B', 2, [[2. / 3], [1. / 3]]),
                    TabularCPD('C', 2, [[0, 0.5, 0.5, 0.5], [1, 0.5, 0.5, 0.5]],
                               evidence=['A', 'B'], evidence_card=[2, 2])])
        self.assertSetEqual(cpds, set(self.model2.get_cpds()))

    def test_disconnected_fit(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        self.model_disconnected.fit(values)

        for node in ['A', 'B', 'C', 'D', 'E']:
            cpd = self.model_disconnected.get_cpds(node)
            self.assertEqual(cpd.variable, node)
            np_test.assert_array_equal(cpd.cardinality, np.array([2]))
            value = (values.loc[:, node].value_counts() /
                     values.loc[:, node].value_counts().sum())
            value = value.reindex(sorted(value.index)).values
            np_test.assert_array_equal(cpd.values, value)

    def test_predict(self):
        titanic = BayesianModel()
        titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
        titanic.fit(self.titanic_data2[500:])

        p1 = titanic.predict(self.titanic_data2[["Sex", "Pclass"]][:30])
        p2 = titanic.predict(self.titanic_data2[["Survived", "Pclass"]][:30])
        p3 = titanic.predict(self.titanic_data2[["Survived", "Sex"]][:30])

        p1_res =  np.array(['0', '1', '0', '1', '0', '0', '0', '0', '0', '1', '0', '1', '0',
                            '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
                            '0', '0', '0', '0'])
        p2_res = np.array(['male', 'female', 'female', 'female', 'male', 'male', 'male',
                           'male', 'female', 'female', 'female', 'female', 'male', 'male',
                           'male', 'female', 'male', 'female', 'male', 'female', 'male',
                           'female', 'female', 'female', 'male', 'female', 'male', 'male',
                           'female', 'male'])
        p3_res = np.array(['3', '1', '1', '1', '3', '3', '3', '3', '1', '1', '1', '1', '3',
                           '3', '3', '1', '3', '1', '3', '1', '3', '1', '1', '1', '3', '1',
                           '3', '3', '1', '3'])

        np_test.assert_array_equal(p1.values.ravel(), p1_res)
        np_test.assert_array_equal(p2.values.ravel(), p2_res)
        np_test.assert_array_equal(p3.values.ravel(), p3_res)

    def test_connected_predict(self):
        np.random.seed(42)
        values = pd.DataFrame(np.array(np.random.randint(low=0, high=2, size=(1000, 5)),
                                       dtype=str),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:800]
        predict_data = values[800:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict, predict_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_predict = self.model_connected.predict(predict_data)
        np_test.assert_array_equal(e_predict.values.ravel(),
                                   np.array([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1,
                                             1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
                                             0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
                                             0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
                                             0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
                                             1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
                                             1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
                                             1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
                                             0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
                                             1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
                                             1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
                                             0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
                                             1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
                                             1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
                                             1, 1, 1, 0], dtype=str))

    def test_connected_predict_probability(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(100, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:80]
        predict_data = values[80:].copy()
        self.model_connected.fit(fit_data)
        predict_data.drop('E', axis=1, inplace=True)
        e_prob = self.model_connected.predict_probability(predict_data)
        np_test.assert_allclose(e_prob.values.ravel(),
                                    np.array([0.57894737,  0.42105263,  0.57894737,  0.42105263,  0.57894737,
                                             0.42105263,  0.5       ,  0.5       ,  0.57894737,  0.42105263,
                                             0.5       ,  0.5       ,  0.57894737,  0.42105263,  0.57894737,
                                             0.42105263,  0.57894737,  0.42105263,  0.5       ,  0.5       ,
                                             0.57894737,  0.42105263,  0.57894737,  0.42105263,  0.5       ,
                                             0.5       ,  0.57894737,  0.42105263,  0.57894737,  0.42105263,
                                             0.5       ,  0.5       ,  0.57894737,  0.42105263,  0.5       ,
                                             0.5       ,  0.5       ,  0.5       ,  0.5       ,  0.5       ]), atol = 0)

    def test_predict_probability_errors(self):
        np.random.seed(42)
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(2, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        fit_data = values[:1]
        predict_data = values[1:].copy()
        self.model_connected.fit(fit_data)
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)
        predict_data = pd.DataFrame(np.random.randint(low=0, high=2, size=(1, 5)),
                              columns=['A', 'B', 'C', 'F', 'E'])[:]
        self.assertRaises(ValueError, self.model_connected.predict_probability, predict_data)

    def tearDown(self):
        del self.model_connected
        del self.model_disconnected
def main():

    andPGM = PGM_t()
    print('loading features..')
    train_set, test_set = andPGM.load_features()
    print('loading features.. Done')
    # Bayesian network of 19 nodes: 9 feature variables per image * 2 images, plus the 'same' node
    # Initial incomplete Bayesian model connected manually based on intuition
    print('Generating model.. ')
    initialModel = BayesianModel()
    initialModel.add_nodes_from(andPGM.img_features.columns[1:10].tolist())
    initialModel.add_edges_from([('f6_a', 'f2_a'),
                                 ('f3_a', 'f4_a'),
                                 ('f5_a', 'f9_a'),
                                 ('f4_a', 'f7_a')])

    # Use hill climb search algorithm to find network structure of initial 9 nodes
    hc = HillClimbSearch(data=andPGM.img_features.iloc[0:, 1:10],
                         scoring_method=BdeuScore(andPGM.img_features.iloc[0:, 1:10],
                                                  equivalent_sample_size=0.1 * len(andPGM.img_features)),
                         state_names=andPGM.states_9)
    # Get best estimated structure
    best_model = hc.estimate(start=initialModel)
    # Edges in the acquired graph
    print('model of 9 var: ', best_model.edges())

    # Create a Clone of generated Bayesian network structure
    clone_model = BayesianModel()
    for edge in best_model.edges():
        new_edge = [edge[0][:-1] + 'b', edge[1][:-1] + 'b']
        clone_model.add_edges_from([new_edge])

    # Join together the Original and clone network through node 'same'
    multinetModel = BayesianModel()
    multinetModel.add_edges_from(best_model.edges() + clone_model.edges())
    multinetModel.add_node('same')
    multinetModel.add_edge('f5_a', 'same')
    multinetModel.add_edge('f9_a', 'same')
    multinetModel.add_edge('f5_b', 'same')
    multinetModel.add_edge('f9_b', 'same')
    print('Generating model.. Done')
    # Edges in the final structure
    print('Final model: ', multinetModel.edges())

    print('Fit data into model..')
    # fit the data to the model to generate CPDs using maximum likelihood estimation
    multinetModel.fit(data=train_set, state_names=andPGM.states_all)
    print('Fit data into model.. Done')
    print('CPDs generated: ')
    cpds = multinetModel.get_cpds()
    for cpd in cpds:
        print(cpd)
    # Inference using Variable Elimination
    print('Start inference..')
    inference = VariableElimination(multinetModel)
    train_set_same = train_set[train_set['same'] == 0]
    train_set_not_same = train_set[train_set['same'] == 1]

    # Accuracy of positive inferences
    acc_same = andPGM.chk_accuracy(
        train_set_same,
        inference,
        variables=train_set_same.columns[0:9].tolist(),
        evidence=train_set_same.columns[9:19].tolist())
    print('accuracy of positives ', acc_same)

    # Accuracy of negative inferences
    acc_nt_same = andPGM.chk_accuracy(
        train_set_not_same,
        inference,
        variables=train_set_not_same.columns[0:9].tolist(),
        evidence=train_set_not_same.columns[9:19].tolist())
    print('accuracy of negatives', acc_nt_same)
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
# Generating some random data
raw_data = np.random.randint(low=0, high=2, size=(100, 2))
print(raw_data)
data = pd.DataFrame(raw_data, columns=['X', 'Y'])
print(data)

# A model of two coin tosses, assuming the tosses are dependent.
coin_model = BayesianModel([('X', 'Y')])
coin_model.fit(data, estimator=MaximumLikelihoodEstimator)
cpd_x = coin_model.get_cpds('X')
print(cpd_x)
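# A hedged sanity check (an addition, not in the original): with maximum likelihood
# estimation, the CPD of the root node 'X' should mirror the empirical frequencies
# of its training column.
print(data['X'].value_counts(normalize=True))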
def bayesian_net():
    musicianship_model = BayesianModel([('Difficulty', 'Rating'),
                                        ('Musicianship', 'Rating'),
                                        ('Musicianship', 'Exam'),
                                        ('Rating', 'Letter')])
    cpd_diff = TabularCPD(variable='Difficulty',
                          variable_card=2,
                          values=[[0.6], [0.4]])  #0->Low, 1->High
    cpd_music = TabularCPD(variable='Musicianship',
                           variable_card=2,
                           values=[[0.7], [0.3]])  #0->Weak 1->Strong
    cpd_rating = TabularCPD(variable='Rating',
                            variable_card=3,
                            values=[[0.3, 0.05, 0.9, 0.5],
                                    [0.4, 0.25, 0.08, 0.3],
                                    [0.3, 0.7, 0.02, 0.2]],
                            evidence=['Difficulty', 'Musicianship'],
                            evidence_card=[2, 2])  #0->* 1->** 2-->***
    cpd_exam = TabularCPD(variable='Exam',
                          variable_card=2,
                          values=[[0.95, 0.2], [0.05, 0.8]],
                          evidence=['Musicianship'],
                          evidence_card=[2])  #0-->Low 1-->High

    cpd_letter = TabularCPD(variable='Letter',
                            variable_card=2,
                            values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]],
                            evidence=['Rating'],
                            evidence_card=[3])  #0-->Weak 1-->Strong

    musicianship_model.add_cpds(cpd_diff, cpd_music, cpd_rating, cpd_exam,
                                cpd_letter)
    musicianship_model.check_model()

    infer = SimpleInference(musicianship_model)  # query without normalization

    print('------------------------')
    print(' EXACT INFERENCE')
    print('------------------------')
    print('--------------------')
    print(
        ' QUERY Letter with evidence Difficulty: 0, Musicianship: 1, Rating: 1, Exam:1  NOT NORMALIZED'
    )
    print('--------------------')
    print(
        infer.query(['Letter'],
                    evidence={('Difficulty', 0), ('Musicianship', 1),
                              ('Rating', 1), ('Exam', 1)}))
    print('--------------------')
    print(
        ' QUERY Letter with evidence Difficulty: 0, Musicianship: 1, Rating: 1, Exam:1  NORMALIZED'
    )
    print('--------------------')
    infer = VariableElimination(musicianship_model)  # query normalized
    print(
        infer.query(['Letter'],
                    evidence={
                        'Difficulty': 0,
                        'Musicianship': 1,
                        'Rating': 1,
                        'Exam': 1
                    })['Letter'])

    print('--------------------')
    print(' QUERY Letter with no evidence')
    print('--------------------')
    print(infer.query(['Letter'])['Letter'])
    print('--------------------')
    print(' QUERY Letter with evidence Musicianship: 0  NORMALIZED')
    print('--------------------')
    print(infer.query(['Letter'], evidence={'Musicianship': 0})['Letter'])

    sampling = BayesianModelSampling(musicianship_model)
    data = sampling.likelihood_weighted_sample(evidence={},
                                               size=2000,
                                               return_type='dataframe')

    musicianship_model_bis = BayesianModel([('Difficulty', 'Rating'),
                                            ('Musicianship', 'Rating'),
                                            ('Rating', 'Letter'),
                                            ('Musicianship', 'Exam')])
    musicianship_model_bis.fit(data, estimator=BayesianEstimator)
    musicianship_model_bis.check_model()
    infer = VariableElimination(musicianship_model_bis)  # query normalized
    for cpd in musicianship_model_bis.get_cpds():
        print("CPD of {variable}:".format(variable=cpd.variable))
        print(cpd)

    print('------------------------')
    print(' APPROXIMATE INFERENCE')
    print('------------------------')

    print('--------------------')
    print(
        ' QUERY Letter with evidence Difficulty: 0, Musicianship: 1, Rating: 1, Exam:1  NORMALIZED'
    )
    print('--------------------')

    print(
        infer.query(['Letter'],
                    evidence={
                        'Difficulty': 0,
                        'Musicianship': 1,
                        'Rating': 1,
                        'Exam': 1
                    })['Letter'])

    print('--------------------')
    print(' QUERY Letter with no evidence')
    print('--------------------')
    print(infer.query(['Letter'])['Letter'])
    print('--------------------')
    print(' QUERY Letter with evidence Musicianship: 0  NORMALIZED')
    print('--------------------')
    print(infer.query(['Letter'], evidence={'Musicianship': 0})['Letter'])
Exemple #33
0
class TestBayesianModelCPD(unittest.TestCase):
    def setUp(self):
        self.G = BayesianModel([('d', 'g'), ('i', 'g'), ('g', 'l'),
                                ('i', 's')])

    def test_active_trail_nodes(self):
        self.assertEqual(sorted(self.G.active_trail_nodes('d')), ['d', 'g', 'l'])
        self.assertEqual(sorted(self.G.active_trail_nodes('i')), ['g', 'i', 'l', 's'])

    def test_active_trail_nodes_args(self):
        self.assertEqual(sorted(self.G.active_trail_nodes('d', observed='g')), ['d', 'i', 's'])
        self.assertEqual(sorted(self.G.active_trail_nodes('l', observed='g')), ['l'])
        self.assertEqual(sorted(self.G.active_trail_nodes('s', observed=['i', 'l'])), ['s'])
        self.assertEqual(sorted(self.G.active_trail_nodes('s', observed=['d', 'l'])), ['g', 'i', 's'])

    def test_is_active_trail_triplets(self):
        self.assertTrue(self.G.is_active_trail('d', 'l'))
        self.assertTrue(self.G.is_active_trail('g', 's'))
        self.assertFalse(self.G.is_active_trail('d', 'i'))
        self.assertTrue(self.G.is_active_trail('d', 'i', observed='g'))
        self.assertFalse(self.G.is_active_trail('d', 'l', observed='g'))
        self.assertFalse(self.G.is_active_trail('i', 'l', observed='g'))
        self.assertTrue(self.G.is_active_trail('d', 'i', observed='l'))
        self.assertFalse(self.G.is_active_trail('g', 's', observed='i'))

    def test_is_active_trail(self):
        self.assertFalse(self.G.is_active_trail('d', 's'))
        self.assertTrue(self.G.is_active_trail('s', 'l'))
        self.assertTrue(self.G.is_active_trail('d', 's', observed='g'))
        self.assertFalse(self.G.is_active_trail('s', 'l', observed='g'))

    def test_is_active_trail_args(self):
        self.assertFalse(self.G.is_active_trail('s', 'l', 'i'))
        self.assertFalse(self.G.is_active_trail('s', 'l', 'g'))
        self.assertTrue(self.G.is_active_trail('d', 's', 'l'))
        self.assertFalse(self.G.is_active_trail('d', 's', ['i', 'l']))

    def test_get_cpds(self):
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

        self.assertEqual(self.G.get_cpds('d').variable, 'd')

    def test_get_cpds1(self):
        self.model = BayesianModel([('A', 'AB')])
        cpd_a = TabularCPD('A', 2, np.random.rand(2, 1))
        cpd_ab = TabularCPD('AB', 2, np.random.rand(2, 2), evidence=['A'],
                            evidence_card=[2])

        self.model.add_cpds(cpd_a, cpd_ab)
        self.assertEqual(self.model.get_cpds('A').variable, 'A')
        self.assertEqual(self.model.get_cpds('AB').variable, 'AB')

    def test_add_single_cpd(self):
        from pgmpy.factors import TabularCPD
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_s)
        self.assertListEqual(self.G.get_cpds(), [cpd_s])

    def test_add_multiple_cpds(self):
        from pgmpy.factors import TabularCPD
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)

        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)
        self.assertEqual(self.G.get_cpds('d'), cpd_d)
        self.assertEqual(self.G.get_cpds('i'), cpd_i)
        self.assertEqual(self.G.get_cpds('g'), cpd_g)
        self.assertEqual(self.G.get_cpds('l'), cpd_l)
        self.assertEqual(self.G.get_cpds('s'), cpd_s)

    def tearDown(self):
        del self.G
Exemple #34
0
'''
Created on Oct 27, 2017

@author: Adele
'''

import numpy as np
import pandas

data = pandas.read_csv("kaggle.csv")

data2 = data[["Survived", "Sex", "Pclass"]]
#data2 = data[["Survived", "Sex", "Pclass"]].replace(["female", "male"], [0, 1]).replace({"Pclass": {3: 0}})

intrain = np.random.rand(len(data2)) < 0.8

dtrain = data2[intrain]
dtest = data2[~intrain]

##print(len(dtrain), len(dtest))

from pgmpy.models import BayesianModel
titanic = BayesianModel()
titanic.add_edges_from([("Sex", "Survived"), ("Pclass", "Survived")])
titanic.fit(dtrain)
for cpd in titanic.get_cpds():
    print(cpd)


print(dtest[["Sex", "Pclass"]])
titanic.predict(dtest[["Sex", "Pclass"]])
class TestBayesianModelCPD(unittest.TestCase):
    def setUp(self):
        self.G = BayesianModel([('d', 'g'), ('i', 'g'), ('g', 'l'),
                                ('i', 's')])

    def test_active_trail_nodes(self):
        self.assertEqual(sorted(self.G.active_trail_nodes('d')), ['d', 'g', 'l'])
        self.assertEqual(sorted(self.G.active_trail_nodes('i')), ['g', 'i', 'l', 's'])

    def test_active_trail_nodes_args(self):
        self.assertEqual(sorted(self.G.active_trail_nodes('d', observed='g')), ['d', 'i', 's'])
        self.assertEqual(sorted(self.G.active_trail_nodes('l', observed='g')), ['l'])
        self.assertEqual(sorted(self.G.active_trail_nodes('s', observed=['i', 'l'])), ['s'])
        self.assertEqual(sorted(self.G.active_trail_nodes('s', observed=['d', 'l'])), ['g', 'i', 's'])

    def test_is_active_trail_triplets(self):
        self.assertTrue(self.G.is_active_trail('d', 'l'))
        self.assertTrue(self.G.is_active_trail('g', 's'))
        self.assertFalse(self.G.is_active_trail('d', 'i'))
        self.assertTrue(self.G.is_active_trail('d', 'i', observed='g'))
        self.assertFalse(self.G.is_active_trail('d', 'l', observed='g'))
        self.assertFalse(self.G.is_active_trail('i', 'l', observed='g'))
        self.assertTrue(self.G.is_active_trail('d', 'i', observed='l'))
        self.assertFalse(self.G.is_active_trail('g', 's', observed='i'))

    def test_is_active_trail(self):
        self.assertFalse(self.G.is_active_trail('d', 's'))
        self.assertTrue(self.G.is_active_trail('s', 'l'))
        self.assertTrue(self.G.is_active_trail('d', 's', observed='g'))
        self.assertFalse(self.G.is_active_trail('s', 'l', observed='g'))

    def test_is_active_trail_args(self):
        self.assertFalse(self.G.is_active_trail('s', 'l', 'i'))
        self.assertFalse(self.G.is_active_trail('s', 'l', 'g'))
        self.assertTrue(self.G.is_active_trail('d', 's', 'l'))
        self.assertFalse(self.G.is_active_trail('d', 's', ['i', 'l']))

    def test_get_cpds(self):
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)

        self.assertEqual(self.G.get_cpds('d').variable, 'd')

    def test_get_cpds1(self):
        self.model = BayesianModel([('A', 'AB')])
        cpd_a = TabularCPD('A', 2, np.random.rand(2, 1))
        cpd_ab = TabularCPD('AB', 2, np.random.rand(2, 2), evidence=['A'],
                            evidence_card=[2])

        self.model.add_cpds(cpd_a, cpd_ab)
        self.assertEqual(self.model.get_cpds('A').variable, 'A')
        self.assertEqual(self.model.get_cpds('AB').variable, 'AB')

    def test_add_single_cpd(self):
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)
        self.G.add_cpds(cpd_s)
        self.assertListEqual(self.G.get_cpds(), [cpd_s])

    def test_add_multiple_cpds(self):
        cpd_d = TabularCPD('d', 2, np.random.rand(2, 1))
        cpd_i = TabularCPD('i', 2, np.random.rand(2, 1))
        cpd_g = TabularCPD('g', 2, np.random.rand(2, 4), ['d', 'i'], [2, 2])
        cpd_l = TabularCPD('l', 2, np.random.rand(2, 2), ['g'], 2)
        cpd_s = TabularCPD('s', 2, np.random.rand(2, 2), ['i'], 2)

        self.G.add_cpds(cpd_d, cpd_i, cpd_g, cpd_l, cpd_s)
        self.assertEqual(self.G.get_cpds('d'), cpd_d)
        self.assertEqual(self.G.get_cpds('i'), cpd_i)
        self.assertEqual(self.G.get_cpds('g'), cpd_g)
        self.assertEqual(self.G.get_cpds('l'), cpd_l)
        self.assertEqual(self.G.get_cpds('s'), cpd_s)

    def test_check_model(self):
        cpd_g = TabularCPD('g', 2,
                           np.array([[0.2, 0.3, 0.4, 0.6],
                                     [0.8, 0.7, 0.6, 0.4]]),
                           ['d', 'i'], [2, 2])

        cpd_s = TabularCPD('s', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['i'], 2)

        cpd_l = TabularCPD('l', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['g'], 2)

        self.G.add_cpds(cpd_g, cpd_s, cpd_l)
        self.assertTrue(self.G.check_model())

    def test_check_model1(self):
        cpd_g = TabularCPD('g', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['i'], 2)
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_g = TabularCPD('g', 2,
                           np.array([[0.2, 0.3, 0.4, 0.6],
                                     [0.8, 0.7, 0.6, 0.4]]),
                           ['d', 's'], [2, 2])
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_g = TabularCPD('g', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['l'], 2)
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_l = TabularCPD('l', 2,
                           np.array([[0.2, 0.3],
                                     [0.8, 0.7]]),
                           ['d'], 2)
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

        cpd_l = TabularCPD('l', 2,
                           np.array([[0.2, 0.3, 0.4, 0.6],
                                     [0.8, 0.7, 0.6, 0.4]]),
                           ['d', 'i'], [2, 2])
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

        cpd_l = TabularCPD('l', 2,
                           np.array([[0.2, 0.3, 0.4, 0.6, 0.2, 0.3, 0.4, 0.6],
                                     [0.8, 0.7, 0.6, 0.4, 0.8, 0.7, 0.6, 0.4]]),
                           ['g', 'd', 'i'], [2, 2, 2])
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

    def test_check_model2(self):
        cpd_s = TabularCPD('s', 2,
                           np.array([[0.5, 0.3],
                                     [0.8, 0.7]]),
                           ['i'], 2)
        self.G.add_cpds(cpd_s)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_s)

        cpd_g = TabularCPD('g', 2,
                           np.array([[0.2, 0.3, 0.4, 0.6],
                                     [0.3, 0.7, 0.6, 0.4]]),
                           ['d', 'i'], [2, 2])
        self.G.add_cpds(cpd_g)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_g)

        cpd_l = TabularCPD('l', 2,
                           np.array([[0.2, 0.3],
                                     [0.1, 0.7]]),
                           ['g'], 2)
        self.G.add_cpds(cpd_l)
        self.assertRaises(ValueError, self.G.check_model)
        self.G.remove_cpds(cpd_l)

    def tearDown(self):
        del self.G
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import BayesianEstimator
# Generating random data for two coin tossing examples
raw_data = np.random.randint(low=0, high=2, size=(1000, 2))
data = pd.DataFrame(raw_data, columns=['X', 'Y'])
print(data)
coin_model = BayesianModel()
coin_model.fit(data, estimator=BayesianEstimator)
coin_model.get_cpds()
coin_model.nodes()
coin_model.edges()
# Now in general machine learning problems it doesn't matter which
# column of the array represents which variable (as long as we use the
# same order for both training and prediction), because all the values
# are on a symmetrical axis, but in graphical models each variable is
# different (in the way it is connected to other variables, etc.), so
# we will need to specify which columns of data are for which
# variable. For that we will use pandas.
import pandas as pd
data = pd.DataFrame(data, columns=['cost', 'quality',
                                   'location', 'no_of_people'])
data
train = data[:750]
# We will try to predict the no_of_people from our model. So for
# test data we will delete that column and then later on predict
# those values.
test = data[750:].drop('no_of_people', axis=1)
test
# Now we will need to create the base network structure for the
# model.
restaurant_model = BayesianModel([('location', 'cost'),
                                  ('quality', 'cost'),
                                  ('location', 'no_of_people'),
                                  ('cost', 'no_of_people')])
restaurant_model.fit(train)
# Fit computes the cpd of all the variables from the training data
# that we provided.
restaurant_model.get_cpds()
# Now for predicting the values of no_of_people using this model
# we can simply call the predict method on our test data.
restaurant_model.predict(test).values.ravel()
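# A hedged extension (an addition, not in the original notebook dump):
# predict_probability returns the probability of each state of the missing
# variable rather than a hard assignment.
print(restaurant_model.predict_probability(test))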