def test_class_init_with_data_nonstring(self):
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, self.g.nodes(), [1, 2, 3])
        six.assertCountEqual(self, self.g.edges(), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (2, 3)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 2)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 4)])
    def test_class_init_with_data_string(self):
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        six.assertCountEqual(self, self.g.nodes(), ['a', 'b', 'c'])
        six.assertCountEqual(self, self.g.edges(), [('a', 'b'), ('a', 'c')])
        self.assertEqual(self.g.parent_node, 'a')
        self.assertSetEqual(self.g.children_nodes, {'b', 'c'})

        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('b', 'c')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('c', 'b')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('d', 'e')])
class TestNaiveBayesMethods(unittest.TestCase):
    def setUp(self):
        self.G1 = NaiveBayes([('a', 'b'), ('a', 'c'), ('a', 'd'), ('a', 'e')])
        self.G2 = NaiveBayes([('d', 'g'), ('d', 'l'), ('d', 's')])

    def test_local_independencies(self):
        self.assertEqual(self.G1.local_independencies('a'), Independencies())
        self.assertEqual(self.G1.local_independencies('b'), Independencies(['b', ['e', 'c', 'd'], 'a']))
        self.assertEqual(self.G1.local_independencies('c'), Independencies(['c', ['e', 'b', 'd'], 'a']))
        self.assertEqual(self.G1.local_independencies('d'), Independencies(['d', ['b', 'c', 'e'], 'a']))

    def test_active_trail_nodes(self):
        self.assertListEqual(sorted(self.G2.active_trail_nodes('d')), ['d', 'g', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('g')), ['d', 'g', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('l')), ['d', 'g', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('s')), ['d', 'g', 'l', 's'])

    def test_active_trail_nodes_args(self):
        self.assertListEqual(sorted(self.G2.active_trail_nodes('d', observed='g')), ['d', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('l', observed='g')), ['d', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('s', observed=['g', 'l'])), ['d', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('s', observed=['d', 'l'])), ['s'])

    def test_get_ancestors_of(self):
        self.assertListEqual(sorted(self.G1._get_ancestors_of('b')), ['a', 'b'])
        self.assertListEqual(sorted(self.G1._get_ancestors_of('e')), ['a', 'e'])
        self.assertListEqual(sorted(self.G1._get_ancestors_of('a')), ['a'])
        self.assertListEqual(sorted(self.G1._get_ancestors_of(['b', 'e'])), ['a', 'b', 'e'])

    def tearDown(self):
        del self.G1
        del self.G2
Example #6
def Bayesian_Net_Model(data):
    cols = data.columns.values
    n_cols = len(data.columns.values)

    BN_Model = NaiveBayes()
    BN_Model.add_nodes_from(cols)

    edges = []
    for i in cols:
        if (i != "Overall"):
            edge = ["Overall", i]
            edges.append(edge)

    BN_Model.add_edges_from(edges)

    print("Aggiunti Archi e Nodi \n\n")

    data_cpts = Compute_CPT(data, "Overall")
    CPTS_list = Generate_CPTs(data_cpts, data, cols, 'Overall')
    test_list = [None] * len(CPTS_list)

    for i in CPTS_list:
        BN_Model.add_cpds(i)

    print("Aggiunte CPD \n\n")

    return BN_Model
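
A hypothetical call, assuming a pandas DataFrame whose columns include "Overall" and that Compute_CPT and Generate_CPTs are defined elsewhere in the project (the file name below is illustrative):

import pandas as pd

df = pd.read_csv('players.csv')  # hypothetical dataset containing an "Overall" column
model = Bayesian_Net_Model(df)
print(model.edges())             # every other column is modelled as a child of "Overall"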
Example #7
    def test_class_init_with_data_string(self):
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        six.assertCountEqual(self, self.g.nodes(), ['a', 'b', 'c'])
        six.assertCountEqual(self, self.g.edges(), [('a', 'b'), ('a', 'c')])
        self.assertEqual(self.g.parent_node, 'a')
        self.assertSetEqual(self.g.children_nodes, {'b', 'c'})

        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('b', 'c')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('c', 'b')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('d', 'e')])
Example #8
    def test_class_init_with_data_nonstring(self):
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, self.g.nodes(), [1, 2, 3])
        six.assertCountEqual(self, self.g.edges(), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (2, 3)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 2)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 4)])
class TestNaiveBayesFit(unittest.TestCase):
    def setUp(self):
        self.model1 = NaiveBayes()
        self.model2 = NaiveBayes([("A", "B")])

    def test_fit_model_creation(self):
        values = pd.DataFrame(
            np.random.randint(low=0, high=2, size=(1000, 5)),
            columns=["A", "B", "C", "D", "E"],
        )

        self.model1.fit(values, "A")
        six.assertCountEqual(self, self.model1.nodes(), ["A", "B", "C", "D", "E"])
        six.assertCountEqual(
            self, self.model1.edges(), [("A", "B"), ("A", "C"), ("A", "D"), ("A", "E")]
        )
        self.assertEqual(self.model1.parent_node, "A")
        self.assertSetEqual(self.model1.children_nodes, {"B", "C", "D", "E"})

        self.model2.fit(values)
        six.assertCountEqual(self, self.model1.nodes(), ["A", "B", "C", "D", "E"])
        six.assertCountEqual(
            self, self.model1.edges(), [("A", "B"), ("A", "C"), ("A", "D"), ("A", "E")]
        )
        self.assertEqual(self.model2.parent_node, "A")
        self.assertSetEqual(self.model2.children_nodes, {"B", "C", "D", "E"})

    def test_fit_model_creation_exception(self):
        values = pd.DataFrame(
            np.random.randint(low=0, high=2, size=(1000, 5)),
            columns=["A", "B", "C", "D", "E"],
        )
        values2 = pd.DataFrame(
            np.random.randint(low=0, high=2, size=(1000, 3)), columns=["C", "D", "E"]
        )

        self.assertRaises(ValueError, self.model1.fit, values)
        self.assertRaises(ValueError, self.model1.fit, values2)
        self.assertRaises(ValueError, self.model2.fit, values2, "A")

    def tearDown(self):
        del self.model1
        del self.model2
Example #11
class TestNaiveBayesFit(unittest.TestCase):
    def setUp(self):
        self.model1 = NaiveBayes()
        self.model2 = NaiveBayes([('A', 'B')])

    def test_fit_model_creation(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])

        self.model1.fit(values, 'A')
        six.assertCountEqual(self, self.model1.nodes(), ['A', 'B', 'C', 'D', 'E'])
        six.assertCountEqual(self, self.model1.edges(),
                             [('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E')])
        self.assertEqual(self.model1.parent_node, 'A')
        self.assertSetEqual(self.model1.children_nodes, {'B', 'C', 'D', 'E'})

        self.model2.fit(values)
        six.assertCountEqual(self, self.model1.nodes(), ['A', 'B', 'C', 'D', 'E'])
        six.assertCountEqual(self, self.model1.edges(),
                             [('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E')])
        self.assertEqual(self.model2.parent_node, 'A')
        self.assertSetEqual(self.model2.children_nodes, {'B', 'C', 'D', 'E'})

    def test_fit_model_creation_exception(self):
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        values2 = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 3)),
                               columns=['C', 'D', 'E'])

        self.assertRaises(ValueError, self.model1.fit, values)
        self.assertRaises(ValueError, self.model1.fit, values2)
        self.assertRaises(ValueError, self.model2.fit, values2, 'A')

    def tearDown(self):
        del self.model1
        del self.model2
Example #15
class TestBaseModelCreation(unittest.TestCase):
    def setUp(self):
        self.G = NaiveBayes()

    def test_class_init_without_data(self):
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        six.assertCountEqual(self, self.g.nodes(), ['a', 'b', 'c'])
        six.assertCountEqual(self, self.g.edges(), [('a', 'b'), ('a', 'c')])
        self.assertEqual(self.g.parent_node, 'a')
        self.assertSetEqual(self.g.children_nodes, {'b', 'c'})

        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('b', 'c')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('c', 'b')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('d', 'e')])

    def test_class_init_with_data_nonstring(self):
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, self.g.nodes(), [1, 2, 3])
        six.assertCountEqual(self, self.g.edges(), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (2, 3)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 2)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 4)])

    def test_add_node_string(self):
        self.G.add_node('a')
        self.assertListEqual(self.G.nodes(), ['a'])

    def test_add_node_nonstring(self):
        self.G.add_node(1)
        self.assertListEqual(self.G.nodes(), [1])

    def test_add_nodes_from_string(self):
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        six.assertCountEqual(self, self.G.nodes(), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        self.G.add_nodes_from([1, 2, 3, 4])
        six.assertCountEqual(self, self.G.nodes(), [1, 2, 3, 4])

    def test_add_edge_string(self):
        self.G.add_edge('a', 'b')
        six.assertCountEqual(self, self.G.nodes(), ['a', 'b'])
        self.assertListEqual(self.G.edges(), [('a', 'b')])
        self.assertEqual(self.G.parent_node, 'a')
        self.assertSetEqual(self.G.children_nodes, {'b'})

        self.G.add_nodes_from(['c', 'd'])
        self.G.add_edge('a', 'c')
        self.G.add_edge('a', 'd')
        six.assertCountEqual(self, self.G.nodes(), ['a', 'b', 'c', 'd'])
        six.assertCountEqual(self, self.G.edges(), [('a', 'b'), ('a', 'c'), ('a', 'd')])
        self.assertEqual(self.G.parent_node, 'a')
        self.assertSetEqual(self.G.children_nodes, {'b', 'c', 'd'})

        self.assertRaises(ValueError, self.G.add_edge, 'b', 'c')
        self.assertRaises(ValueError, self.G.add_edge, 'd', 'f')
        self.assertRaises(ValueError, self.G.add_edge, 'e', 'f')
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'e'), ('b', 'f')])
        self.assertRaises(ValueError, self.G.add_edges_from, [('b', 'f')])

    def test_add_edge_nonstring(self):
        self.G.add_edge(1, 2)
        six.assertCountEqual(self, self.G.nodes(), [1, 2])
        self.assertListEqual(self.G.edges(), [(1, 2)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2})

        self.G.add_nodes_from([3, 4])
        self.G.add_edge(1, 3)
        self.G.add_edge(1, 4)
        six.assertCountEqual(self, self.G.nodes(), [1, 2, 3, 4])
        six.assertCountEqual(self, self.G.edges(), [(1, 2), (1, 3), (1, 4)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2, 3, 4})

        self.assertRaises(ValueError, self.G.add_edge, 2, 3)
        self.assertRaises(ValueError, self.G.add_edge, 3, 6)
        self.assertRaises(ValueError, self.G.add_edge, 5, 6)
        self.assertRaises(ValueError, self.G.add_edges_from, [(1, 5), (2, 6)])
        self.assertRaises(ValueError, self.G.add_edges_from, [(2, 6)])

    def test_add_edge_selfloop(self):
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')
        self.assertRaises(ValueError, self.G.add_edge, 1, 1)

    def test_add_edges_from_self_loop(self):
        self.assertRaises(ValueError, self.G.add_edges_from,
                          [('a', 'a')])

    def test_update_node_parents_bm_constructor(self):
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        self.assertListEqual(self.g.predecessors('a'), [])
        self.assertListEqual(self.g.predecessors('b'), ['a'])
        self.assertListEqual(self.g.predecessors('c'), ['a'])

    def test_update_node_parents(self):
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertListEqual(self.G.predecessors('a'), [])
        self.assertListEqual(self.G.predecessors('b'), ['a'])
        self.assertListEqual(self.G.predecessors('c'), ['a'])

    def tearDown(self):
        del self.G
class PGMNaiveBayes(TextClassifier):
    def __add_category(
        self, categories: Union[str, List[str], List[Tuple[str, str]], Dict[str, str]]
    ) -> PGMNaiveBayes:
        '''setup the bayes network with a new category entry'''
        if type(categories) is str: categories = [categories]
        if type(categories) is dict: categories = categories.items()
        to_create = False
        for category in categories:
            if type(category) is str: category = (category, category)
            category, index = category
            if category not in self.categories:
                self.categories[category] = index
                self.cardinality = len(self.categories) or 1
                to_create = True
        if to_create: self.__create_class_cpd()
        return self

    def __add_token(self, tokens: Union[str, List[str]]) -> PGMNaiveBayes:
        if type(tokens) is str: tokens = [tokens]
        to_create = []
        for token in tokens:
            if token not in self.tokens:
                to_create.append(token)
                self.total_tokens += 1
        self.__create_word_cpd(to_create)
        return self

    def __create_word_cpd(self,
                          tokens: Union[str, List[str]],
                          check: bool = True) -> PGMNaiveBayes:
        '''Generate the table for the given token node'''
        if type(tokens) is str: tokens = [tokens]
        cpds = []
        for token in tokens:
            if token in self.tokens:
                self.model.remove_cpds(self.tokens[token])
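            # fresh placeholder CPD for this token: 2 rows (absent/present) x current class cardinality, uniform 0.5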
            cpd_word = TabularCPD(
                variable=token,
                variable_card=2,
                evidence=[Data.CATEGORY_NAME],
                evidence_card=[self.cardinality],
                values=[[0.5 for _ in range(self.cardinality)]] * 2)
            self.tokens[token] = cpd_word
            cpds.append(cpd_word)

        self.model.add_nodes_from(tokens)
        self.model.add_edges_from([(Data.CATEGORY_NAME, token)
                                   for token in tokens])
        self.model.add_cpds(*cpds)
        # if check: self.model.check_model()
        return self

    def __create_class_cpd(self, check: bool = True) -> PGMNaiveBayes:
        '''Generate the table for the category node'''
        if self.cpd_class:
            self.model.remove_cpds(self.cpd_class)
        self.cpd_class = TabularCPD(variable=Data.CATEGORY_NAME,
                                    variable_card=self.cardinality,
                                    values=[[1 / self.cardinality]
                                            for _ in range(self.cardinality)])
        self.model.add_cpds(self.cpd_class)
        # if check: self.model.check_model()
        return self

    def __cpd_to_json(self, cpd: TabularCPD) -> Dict:
        return {
            'variable': cpd.variable,
            'variables': cpd.variables,
            'variable_card': cpd.variable_card.tolist(),
            'values': cpd.values.tolist()
        }

    def __cpd_from_json(self, cpd: Dict) -> TabularCPD:
        return TabularCPD(**cpd)

    def reset(self) -> PGMNaiveBayes:
        '''Totally reset the Classifier'''
        self.categories = {}
        self.tokens = {}
        self.cardinality = 1
        self.total_documents = 0
        self.total_tokens = 0
        self.cpd_class = None
        self.model = NaiveBayes()
        self.model.add_node(Data.CATEGORY_NAME)
        return self

    def token_probability(self, token: str, category: str) -> float:
        '''return the probability of a given token to belong a given category'''
        probability = self.model.predict_probability(
            pd.DataFrame([[1]], columns=[token]))
        column = '{}_{}'.format(Data.CATEGORY_NAME,
                                self.categories.get(category, 0))
        return probability[column][0] if column in probability else 0

    def category_probability(self, category: str) -> float:
        '''return the probability of the given category'''
        score = Data.CATEGORY_VALUES.get(category, 0)
        elimination = VariableElimination(self.model)
        probability = elimination.query(variables=[Data.CATEGORY_NAME])
        state = probability.get_state_no(Data.CATEGORY_NAME,
                                         self.categories.get(category, 0))
        return probability.values[state]

    def word_probability(self, text: str) -> pd.DataFrame:
        '''retrive the probability table of the given text without knowing the probability of the category (no evidence): P(C | w1,...,wn)'''
        data = Data(text)
        elimination = VariableElimination(self.model)
        values = [[] for _ in range(self.cardinality)]
        for token in data.tokens:
            if token not in self.tokens:
                for v in values:
                    v.append(1 / (self.cardinality or 1))
            else:
                probability = elimination.query(variables=[Data.CATEGORY_NAME],
                                                evidence={
                                                    token: 1
                                                }).values
                for i in range(len(probability)):
                    values[i].append(probability[i])

        return pd.DataFrame(np.array(values).T,
                            columns=list(self.categories),
                            index=data.tokens)

    def probability(self, text: str) -> pd.DataFrame:
        '''retrive the probability table of the given text knowing the probability of categories: P(C) * P(C | w1,...,wn)'''
        data = Data(text)
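        # one query row: 1 for every known token that appears in the text, 0 otherwise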
        values = pd.DataFrame(
            [[1 if t in data.table else 0 for t in self.tokens]],
            columns=self.tokens)
        probabilities = self.model.predict_probability(values)
        return probabilities.rename(
            columns={
                '{}_{}'.format(Data.CATEGORY_NAME, v): k
                for k, v in self.categories.items()
            })

    def fit(self,
            text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
            category: Union[str, Iterable[str]] = None) -> TextClassifier:
        '''learn probabilities for tokens extracted by the given text'''
        data = DataSet.FromAny(text, category)

        categories = []
        tokens = {}
        values = []

        for d in data:
            categories.append((d.category, d.score))
            for token in d.tokens:
                tokens[token] = 1
            values.append((d.table, d.score))
            self.total_documents += 1

        tokens = list(tokens)
        self.__add_category(categories)
        self.__add_token(tokens)

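        # one training row per document: a 0/1 indicator for each known token,
        # with the numeric category score appended as the class column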
        data_values = [[1 if t in v[0] else 0 for t in tokens] + [v[1]]
                       for v in values]

        tokens.append(Data.CATEGORY_NAME)

        data_values = pd.DataFrame(data_values, columns=tokens)

        self.model.fit(data_values, Data.CATEGORY_NAME)

        return self

    def words(self, categories: Union[str, Iterable[str]]) -> pd.DataFrame:
        '''return a sorted by probability table with tokens as rows and categories as columns, for the given categories'''
        elimination = VariableElimination(self.model)
        values = [[] for _ in range(self.cardinality)]
        for token in self.tokens:
            probability = elimination.query(variables=[Data.CATEGORY_NAME],
                                            evidence={
                                                token: 1
                                            }).values
            for i in range(len(probability)):
                values[i].append(probability[i])

        return pd.DataFrame(np.array(values).T,
                            columns=list(self.categories),
                            index=list(self.tokens))

    def to_json(self) -> Dict:
        return {
            'categories': self.categories,
            'total_documents': self.total_documents,
            'tokens': {
                c.variable: c.values.tolist()
                for c in self.model.get_cpds()
                if c.variable != Data.CATEGORY_NAME
            },
        }

    def from_json(self, data: Dict) -> PGMNaiveBayes:
        self.total_documents = data.get('total_documents',
                                        self.total_documents)
        self.__add_category(data.get('categories', {}))
        self.model.remove_cpds(self.cpd_class)
        self.cpd_class = TabularCPD(
            **data.get('class')) if 'class' in data else self.cpd_class
        self.model.add_cpds(self.cpd_class)
        tokens = data.get('tokens', {})
        self.__add_token(list(tokens))
        cpds = {c.variable: c for c in self.model.get_cpds()}
        for token, values in tokens.items():
            if token in cpds:
                cpds[token].values = np.array(values)[0:self.cardinality,
                                                      0:self.cardinality]

        self.model.check_model()
        return self

    def __str__(self) -> str:
        return 'NaiveBayes<{}, {}>[{}]'.format(self.total_documents,
                                               self.total_tokens,
                                               str.join(', ', self.categories))

    def __repr__(self) -> str:
        return str(self)
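
A minimal usage sketch, assuming the TextClassifier base initialises the model via reset() and that the project's DataSet.FromAny accepts parallel lists of texts and category labels (both helpers live outside this excerpt):

clf = PGMNaiveBayes()
clf.fit(["cheap pills online now", "lunch at noon tomorrow"],  # training texts
        ["spam", "ham"])                                       # matching categories
print(clf.probability("cheap lunch online"))       # P(C) * P(C | w1,...,wn) per category
print(clf.word_probability("cheap lunch online"))  # per-token table without the class prior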
Example #17
File: NB.py  Project: laugek/PGM
# Print an example of 1 instance in the dataset
print("\nAn example of a person")
print(df.iloc[0])

# Split the data to test and train
test_size = 0.33
print("\nSplitting in to training and test data using: Test size = ", test_size)
data_train, data_test = train_test_split(df, test_size=test_size)
print("training data:", len(data_train))
print("test data:", len(data_test))


#################################################################################
##### Defining the model
#################################################################################
model = NaiveBayes()

# Learning CPDs using Maximum Likelihood Estimators
model.fit(data_train, 'class', estimator=MaximumLikelihoodEstimator)
# Print the CPDs learned
print("\n\n............Overview of our CPDs from the fit...........:")
for cpd in model.get_cpds():
    print("CPD of {variable}:".format(variable=cpd.variable))
    print(cpd)

print("\n\n............Overview of levels in variables...........:")
for col in df:
    print(col,":", len(df[col].unique()) )

#################################################################################
##### Using the model to query
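
# A minimal sketch of querying the fitted model, assuming the 'class' target and the
# data_train/data_test split defined above:
from pgmpy.inference import VariableElimination

infer = VariableElimination(model)
print(infer.query(variables=['class']))  # marginal distribution of the class variable

# Predict the missing 'class' column for the held-out rows
predictions = model.predict(data_test.drop(columns=['class']))
print(predictions.head())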
Example #21
        CPD = pickle.load(fp)
    with open("RandomColumns.txt", "rb") as fp:
        random_columns = pickle.load(fp)
    with open("RandomIndices.txt", "rb") as fp:
        random_indices = pickle.load(fp)
    data = data.iloc[:, random_columns]
    column_size = data.shape[1]
    #Delete invoices with all zeros from the data
    data = data[(data.T != 0).any()]
    row_size = data.shape[0]
    smallDF = data.iloc[random_indices, :]
    smallDF.shape
    DictOfModels = {}
    for productName in smallDF.columns:
        print('Collecting model for {0}'.format(productName))
        model = NaiveBayes()
        model.add_nodes_from(Nodes[productName])
        model.add_edges_from(Edges[productName])
        model.add_cpds(*CPD[productName])
        DictOfModels[productName] = model
        # Save edge, node, CPD information
    PseudoCounts = {}
    #Pseudocounts are given (1,1) for uniform
    for productName in smallDF.columns:
        PseudoCounts[productName] = [1, 1]
except:
    print('Existing model not found')
    #Select random invoice (2000) and products (50)
    seed(0)
    column_size = data.shape[1]
    random_columns = sample(range(column_size), 100)
class TestNaiveBayesMethods(unittest.TestCase):
    def setUp(self):
        self.G1 = NaiveBayes([('a', 'b'), ('a', 'c'), ('a', 'd'), ('a', 'e')])
        self.G2 = NaiveBayes([('d', 'g'), ('d', 'l'), ('d', 's')])

    def test_local_independencies(self):
        self.assertListEqual(self.G1.local_independencies('a'), [None])
        self.assertListEqual(self.G1.local_independencies('b'),
                             [Independencies(['b', ['e', 'c', 'd'], 'a'])])
        self.assertListEqual(self.G1.local_independencies('c'),
                             [Independencies(['c', ['e', 'b', 'd'], 'a'])])
        self.assertListEqual(self.G1.local_independencies('d'),
                             [Independencies(['d', ['b', 'c', 'e'], 'a'])])

    def test_active_trail_nodes(self):
        self.assertListEqual(sorted(self.G2.active_trail_nodes('d')),
                             ['d', 'g', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('g')),
                             ['d', 'g', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('l')),
                             ['d', 'g', 'l', 's'])
        self.assertListEqual(sorted(self.G2.active_trail_nodes('s')),
                             ['d', 'g', 'l', 's'])

    def test_active_trail_nodes_args(self):
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes('d', observed='g')),
            ['d', 'l', 's'])
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes('l', observed='g')),
            ['d', 'l', 's'])
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes('s', observed=['g', 'l'])),
            ['d', 's'])
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes('s', observed=['d', 'l'])),
            ['s'])

    def tearDown(self):
        del self.G1
        del self.G2
class TestBaseModelCreation(unittest.TestCase):
    def setUp(self):
        self.G = NaiveBayes()

    def test_class_init_without_data(self):
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        six.assertCountEqual(self, self.g.nodes(), ['a', 'b', 'c'])
        six.assertCountEqual(self, self.g.edges(), [('a', 'b'), ('a', 'c')])
        self.assertEqual(self.g.parent_node, 'a')
        self.assertSetEqual(self.g.children_nodes, {'b', 'c'})

        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('b', 'c')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('c', 'b')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('d', 'e')])

    def test_class_init_with_data_nonstring(self):
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, self.g.nodes(), [1, 2, 3])
        six.assertCountEqual(self, self.g.edges(), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (2, 3)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 2)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 4)])

    def test_add_node_string(self):
        self.G.add_node('a')
        self.assertListEqual(self.G.nodes(), ['a'])

    def test_add_node_nonstring(self):
        self.G.add_node(1)
        self.assertListEqual(self.G.nodes(), [1])

    def test_add_nodes_from_string(self):
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        six.assertCountEqual(self, self.G.nodes(), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        self.G.add_nodes_from([1, 2, 3, 4])
        six.assertCountEqual(self, self.G.nodes(), [1, 2, 3, 4])

    def test_add_edge_string(self):
        self.G.add_edge('a', 'b')
        six.assertCountEqual(self, self.G.nodes(), ['a', 'b'])
        self.assertListEqual(self.G.edges(), [('a', 'b')])
        self.assertEqual(self.G.parent_node, 'a')
        self.assertSetEqual(self.G.children_nodes, {'b'})

        self.G.add_nodes_from(['c', 'd'])
        self.G.add_edge('a', 'c')
        self.G.add_edge('a', 'd')
        six.assertCountEqual(self, self.G.nodes(), ['a', 'b', 'c', 'd'])
        six.assertCountEqual(self, self.G.edges(), [('a', 'b'), ('a', 'c'),
                                                    ('a', 'd')])
        self.assertEqual(self.G.parent_node, 'a')
        self.assertSetEqual(self.G.children_nodes, {'b', 'c', 'd'})

        self.assertRaises(ValueError, self.G.add_edge, 'b', 'c')
        self.assertRaises(ValueError, self.G.add_edge, 'd', 'f')
        self.assertRaises(ValueError, self.G.add_edge, 'e', 'f')
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'e'),
                                                              ('b', 'f')])
        self.assertRaises(ValueError, self.G.add_edges_from, [('b', 'f')])

    def test_add_edge_nonstring(self):
        self.G.add_edge(1, 2)
        six.assertCountEqual(self, self.G.nodes(), [1, 2])
        self.assertListEqual(self.G.edges(), [(1, 2)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2})

        self.G.add_nodes_from([3, 4])
        self.G.add_edge(1, 3)
        self.G.add_edge(1, 4)
        six.assertCountEqual(self, self.G.nodes(), [1, 2, 3, 4])
        six.assertCountEqual(self, self.G.edges(), [(1, 2), (1, 3), (1, 4)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2, 3, 4})

        self.assertRaises(ValueError, self.G.add_edge, 2, 3)
        self.assertRaises(ValueError, self.G.add_edge, 3, 6)
        self.assertRaises(ValueError, self.G.add_edge, 5, 6)
        self.assertRaises(ValueError, self.G.add_edges_from, [(1, 5), (2, 6)])
        self.assertRaises(ValueError, self.G.add_edges_from, [(2, 6)])

    def test_add_edge_selfloop(self):
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')
        self.assertRaises(ValueError, self.G.add_edge, 1, 1)

    def test_add_edges_from_self_loop(self):
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'a')])

    def test_update_node_parents_bm_constructor(self):
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        self.assertListEqual(self.g.predecessors('a'), [])
        self.assertListEqual(self.g.predecessors('b'), ['a'])
        self.assertListEqual(self.g.predecessors('c'), ['a'])

    def test_update_node_parents(self):
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertListEqual(self.G.predecessors('a'), [])
        self.assertListEqual(self.G.predecessors('b'), ['a'])
        self.assertListEqual(self.G.predecessors('c'), ['a'])

    def tearDown(self):
        del self.G
class TestNaiveBayesMethods(unittest.TestCase):
    def setUp(self):
        self.G1 = NaiveBayes([("a", "b"), ("a", "c"), ("a", "d"), ("a", "e")])
        self.G2 = NaiveBayes([("d", "g"), ("d", "l"), ("d", "s")])

    def test_local_independencies(self):
        self.assertEqual(self.G1.local_independencies("a"), Independencies())
        self.assertEqual(
            self.G1.local_independencies("b"),
            Independencies(["b", ["e", "c", "d"], "a"]),
        )
        self.assertEqual(
            self.G1.local_independencies("c"),
            Independencies(["c", ["e", "b", "d"], "a"]),
        )
        self.assertEqual(
            self.G1.local_independencies("d"),
            Independencies(["d", ["b", "c", "e"], "a"]),
        )

    def test_active_trail_nodes(self):
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("d")), ["d", "g", "l", "s"]
        )
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("g")), ["d", "g", "l", "s"]
        )
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("l")), ["d", "g", "l", "s"]
        )
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("s")), ["d", "g", "l", "s"]
        )

    def test_active_trail_nodes_args(self):
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("d", observed="g")), ["d", "l", "s"]
        )
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("l", observed="g")), ["d", "l", "s"]
        )
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("s", observed=["g", "l"])), ["d", "s"]
        )
        self.assertListEqual(
            sorted(self.G2.active_trail_nodes("s", observed=["d", "l"])), ["s"]
        )

    def test_get_ancestors_of(self):
        self.assertListEqual(sorted(self.G1._get_ancestors_of("b")), ["a", "b"])
        self.assertListEqual(sorted(self.G1._get_ancestors_of("e")), ["a", "e"])
        self.assertListEqual(sorted(self.G1._get_ancestors_of("a")), ["a"])
        self.assertListEqual(
            sorted(self.G1._get_ancestors_of(["b", "e"])), ["a", "b", "e"]
        )

    def tearDown(self):
        del self.G1
        del self.G2
class TestBaseModelCreation(unittest.TestCase):
    def setUp(self):
        self.G = NaiveBayes()

    def test_class_init_without_data(self):
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        self.g = NaiveBayes([("a", "b"), ("a", "c")])
        six.assertCountEqual(self, list(self.g.nodes()), ["a", "b", "c"])
        six.assertCountEqual(self, list(self.g.edges()), [("a", "b"), ("a", "c")])
        self.assertEqual(self.g.parent_node, "a")
        self.assertSetEqual(self.g.children_nodes, {"b", "c"})

        self.assertRaises(ValueError, NaiveBayes, [("a", "b"), ("b", "c")])
        self.assertRaises(ValueError, NaiveBayes, [("a", "b"), ("c", "b")])
        self.assertRaises(ValueError, NaiveBayes, [("a", "b"), ("d", "e")])

    def test_class_init_with_data_nonstring(self):
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, list(self.g.nodes()), [1, 2, 3])
        six.assertCountEqual(self, list(self.g.edges()), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (2, 3)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 2)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 4)])

    def test_add_node_string(self):
        self.G.add_node("a")
        self.assertListEqual(list(self.G.nodes()), ["a"])

    def test_add_node_nonstring(self):
        self.G.add_node(1)
        self.assertListEqual(list(self.G.nodes()), [1])

    def test_add_nodes_from_string(self):
        self.G.add_nodes_from(["a", "b", "c", "d"])
        six.assertCountEqual(self, list(self.G.nodes()), ["a", "b", "c", "d"])

    def test_add_nodes_from_non_string(self):
        self.G.add_nodes_from([1, 2, 3, 4])
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2, 3, 4])

    def test_add_edge_string(self):
        self.G.add_edge("a", "b")
        six.assertCountEqual(self, list(self.G.nodes()), ["a", "b"])
        self.assertListEqual(list(self.G.edges()), [("a", "b")])
        self.assertEqual(self.G.parent_node, "a")
        self.assertSetEqual(self.G.children_nodes, {"b"})

        self.G.add_nodes_from(["c", "d"])
        self.G.add_edge("a", "c")
        self.G.add_edge("a", "d")
        six.assertCountEqual(self, list(self.G.nodes()), ["a", "b", "c", "d"])
        six.assertCountEqual(
            self, list(self.G.edges()), [("a", "b"), ("a", "c"), ("a", "d")]
        )
        self.assertEqual(self.G.parent_node, "a")
        self.assertSetEqual(self.G.children_nodes, {"b", "c", "d"})

        self.assertRaises(ValueError, self.G.add_edge, "b", "c")
        self.assertRaises(ValueError, self.G.add_edge, "d", "f")
        self.assertRaises(ValueError, self.G.add_edge, "e", "f")
        self.assertRaises(ValueError, self.G.add_edges_from, [("a", "e"), ("b", "f")])
        self.assertRaises(ValueError, self.G.add_edges_from, [("b", "f")])

    def test_add_edge_nonstring(self):
        self.G.add_edge(1, 2)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2])
        self.assertListEqual(list(self.G.edges()), [(1, 2)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2})

        self.G.add_nodes_from([3, 4])
        self.G.add_edge(1, 3)
        self.G.add_edge(1, 4)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2, 3, 4])
        six.assertCountEqual(self, list(self.G.edges()), [(1, 2), (1, 3), (1, 4)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2, 3, 4})

        self.assertRaises(ValueError, self.G.add_edge, 2, 3)
        self.assertRaises(ValueError, self.G.add_edge, 3, 6)
        self.assertRaises(ValueError, self.G.add_edge, 5, 6)
        self.assertRaises(ValueError, self.G.add_edges_from, [(1, 5), (2, 6)])
        self.assertRaises(ValueError, self.G.add_edges_from, [(2, 6)])

    def test_add_edge_selfloop(self):
        self.assertRaises(ValueError, self.G.add_edge, "a", "a")
        self.assertRaises(ValueError, self.G.add_edge, 1, 1)

    def test_add_edges_from_self_loop(self):
        self.assertRaises(ValueError, self.G.add_edges_from, [("a", "a")])

    def test_update_node_parents_bm_constructor(self):
        self.g = NaiveBayes([("a", "b"), ("a", "c")])
        self.assertListEqual(list(self.g.predecessors("a")), [])
        self.assertListEqual(list(self.g.predecessors("b")), ["a"])
        self.assertListEqual(list(self.g.predecessors("c")), ["a"])

    def test_update_node_parents(self):
        self.G.add_nodes_from(["a", "b", "c"])
        self.G.add_edges_from([("a", "b"), ("a", "c")])
        self.assertListEqual(list(self.G.predecessors("a")), [])
        self.assertListEqual(list(self.G.predecessors("b")), ["a"])
        self.assertListEqual(list(self.G.predecessors("c")), ["a"])

    def tearDown(self):
        del self.G
Example #31
# Split the data to test and train
print("Data set size:", len(df))
print("\n\n............Splitting the data in test and train...........:\n")
test_size = 0.33
print("Test size = ", test_size)
data_train, data_test = train_test_split(df, test_size=test_size)
print("training data:", len(data_train))
print("test data:", len(data_test))
input("\n ")


#################################################################################
##### Defining the model
#################################################################################
model = NaiveBayes()

# Learning CPDs using Maximum Likelihood Estimators
model.fit(data_train, 'class', estimator=MaximumLikelihoodEstimator)
# Print the CPDs learned
print("\n\n............Selected CPDs from the fit...........:\n")
print('CPD: class (parent of all nodes)')
print(model.get_cpds('class'))
input("\n ")
print('\nCPD: sex')
print(model.get_cpds('sex'))
# print(model.get_cpds('race'))

input("\n ")

# print("\n\n............Overview of levels in variables...........:\n")
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from pgmpy.models import NaiveBayes


col_names = pd.read_csv('data/names.csv')  # 'data/names.csv'
data = pd.read_csv('data/breast-cancer-wisconsin.data', names=col_names.columns)
data = data[data["bare_nuclei"] != '?']
data.set_index('id', inplace=True) #stop the model from using id as a node

train, test = train_test_split(data, test_size=0.2, random_state=0)
Y_test = test['class']
test = test.drop(['class'], axis=1)

#fit model
model = NaiveBayes()
model.fit(train, 'class')
print("Naive Bayes edges:        ", model.edges())

#make predictions
Y_pred = model.predict(test)

#Convert Labels so we can use sklearn function to evaluate our model
labelencoder = LabelEncoder()
Y_test = labelencoder.fit_transform(Y_test.values.ravel())
Y_pred = labelencoder.fit_transform(Y_pred.values.ravel())

# Output results
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
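
# Report the evaluation metrics computed above
print("accuracy: ", accuracy)
print("precision:", precision)
print("f1 score: ", f1)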