def test_class_init_with_data_nonstring(self):
    """Check construction from integer-labelled edges and rejection of bad edge lists."""
    self.g = NaiveBayes([(1, 2), (1, 3)])
    six.assertCountEqual(self, self.g.nodes(), [1, 2, 3])
    six.assertCountEqual(self, self.g.edges(), [(1, 2), (1, 3)])
    self.assertEqual(self.g.parent_node, 1)
    self.assertSetEqual(self.g.children_nodes, {2, 3})

    # Each of these edge lists is expected to be refused with ValueError.
    rejected_edge_lists = (
        [(1, 2), (2, 3)],  # second edge starts at a child of the first
        [(1, 2), (3, 2)],  # second edge enters the same child from another source
        [(1, 2), (3, 4)],  # second edge shares no node with the first
    )
    for edge_list in rejected_edge_lists:
        with self.assertRaises(ValueError):
            NaiveBayes(edge_list)
def test_class_init_with_data_string(self):
    """Check construction from string-labelled edges and rejection of bad edge lists."""
    self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
    six.assertCountEqual(self, self.g.nodes(), ['a', 'b', 'c'])
    six.assertCountEqual(self, self.g.edges(), [('a', 'b'), ('a', 'c')])
    self.assertEqual(self.g.parent_node, 'a')
    self.assertSetEqual(self.g.children_nodes, {'b', 'c'})

    # Each of these edge lists is expected to be refused with ValueError.
    rejected_edge_lists = (
        [('a', 'b'), ('b', 'c')],  # second edge starts at a child of the first
        [('a', 'b'), ('c', 'b')],  # second edge enters the same child from another source
        [('a', 'b'), ('d', 'e')],  # second edge shares no node with the first
    )
    for edge_list in rejected_edge_lists:
        with self.assertRaises(ValueError):
            NaiveBayes(edge_list)
def test_class_init_with_data_string(self):
    """Check construction from string-labelled edges and rejection of bad edge lists."""
    self.g = NaiveBayes([("a", "b"), ("a", "c")])

    observed_nodes = list(self.g.nodes())
    six.assertCountEqual(self, observed_nodes, ["a", "b", "c"])
    observed_edges = list(self.g.edges())
    six.assertCountEqual(self, observed_edges, [("a", "b"), ("a", "c")])

    self.assertEqual(self.g.parent_node, "a")
    self.assertSetEqual(self.g.children_nodes, {"b", "c"})

    # Each of these edge lists is expected to be refused with ValueError.
    rejected_edge_lists = (
        [("a", "b"), ("b", "c")],  # second edge starts at a child of the first
        [("a", "b"), ("c", "b")],  # second edge enters the same child from another source
        [("a", "b"), ("d", "e")],  # second edge shares no node with the first
    )
    for edge_list in rejected_edge_lists:
        with self.assertRaises(ValueError):
            NaiveBayes(edge_list)
class TestNaiveBayesMethods(unittest.TestCase):
    """Query-method tests (independencies, active trails, ancestors) on two fixed models."""

    def setUp(self):
        """Build G1 (source 'a', four targets) and G2 (source 'd', three targets)."""
        self.G1 = NaiveBayes([('a', child) for child in 'bcde'])
        self.G2 = NaiveBayes([('d', child) for child in 'gls'])

    def test_local_independencies(self):
        """local_independencies() must equal the expected Independencies per node."""
        expectations = (
            ('a', Independencies()),
            ('b', Independencies(['b', ['e', 'c', 'd'], 'a'])),
            ('c', Independencies(['c', ['e', 'b', 'd'], 'a'])),
            ('d', Independencies(['d', ['b', 'c', 'e'], 'a'])),
        )
        for node, expected in expectations:
            self.assertEqual(self.G1.local_independencies(node), expected)

    def test_active_trail_nodes(self):
        """With nothing observed, every node of G2 reaches all four nodes."""
        everything = ['d', 'g', 'l', 's']
        for start in ('d', 'g', 'l', 's'):
            self.assertListEqual(
                sorted(self.G2.active_trail_nodes(start)), everything)

    def test_active_trail_nodes_args(self):
        """Observed nodes change the sorted active-trail result as listed."""
        cases = (
            ('d', 'g', ['d', 'l', 's']),
            ('l', 'g', ['d', 'l', 's']),
            ('s', ['g', 'l'], ['d', 's']),
            ('s', ['d', 'l'], ['s']),
        )
        for start, observed, expected in cases:
            reached = self.G2.active_trail_nodes(start, observed=observed)
            self.assertListEqual(sorted(reached), expected)

    def test_get_ancestors_of(self):
        """_get_ancestors_of() result, sorted, includes the queried node(s) themselves."""
        cases = (
            ('b', ['a', 'b']),
            ('e', ['a', 'e']),
            ('a', ['a']),
            (['b', 'e'], ['a', 'b', 'e']),
        )
        for query, expected in cases:
            self.assertListEqual(
                sorted(self.G1._get_ancestors_of(query)), expected)

    def tearDown(self):
        """Drop both fixtures."""
        del self.G1, self.G2
def reset(self) -> PGMNaiveBayes:
    '''Return the classifier to a blank state and hand back ``self``.

    Clears the category and token maps, zeroes the counters, forgets the
    class CPD and replaces the model with a new ``NaiveBayes`` that holds
    only the ``Data.CATEGORY_NAME`` node.
    '''
    # Two distinct dicts on purpose: they are filled independently later.
    self.categories, self.tokens = {}, {}
    self.cardinality = 1
    self.total_documents = self.total_tokens = 0
    self.cpd_class = None

    self.model = NaiveBayes()
    self.model.add_node(Data.CATEGORY_NAME)
    return self
def Bayesian_Net_Model(data):
    """Build a ``NaiveBayes`` model with "Overall" pointing at every other column.

    Args:
        data: object exposing ``.columns.values`` (presumably a pandas
            DataFrame with an "Overall" column -- confirm against the caller).

    Returns:
        The ``NaiveBayes`` instance after nodes, edges and the CPDs produced
        by ``Generate_CPTs`` have been added.
    """
    cols = data.columns.values

    BN_Model = NaiveBayes()
    BN_Model.add_nodes_from(cols)
    # One edge from "Overall" to each remaining column.
    edges = [["Overall", col] for col in cols if col != "Overall"]
    BN_Model.add_edges_from(edges)
    print("Aggiunti Archi e Nodi \n\n")

    data_cpts = Compute_CPT(data, "Overall")
    CPTS_list = Generate_CPTs(data_cpts, data, cols, 'Overall')
    for cpt in CPTS_list:
        BN_Model.add_cpds(cpt)
    print("Aggiunte CPD \n\n")
    return BN_Model
class TestNaiveBayesFit(unittest.TestCase):
    """Tests for the graph structure that ``NaiveBayes.fit`` builds from a DataFrame."""

    def setUp(self):
        """model1 starts without edges; model2 starts with the single edge ("A", "B")."""
        self.model1 = NaiveBayes()
        self.model2 = NaiveBayes([("A", "B")])

    def test_fit_model_creation(self):
        """fit() must yield A -> {B, C, D, E}, with or without an explicit parent."""
        values = pd.DataFrame(
            np.random.randint(low=0, high=2, size=(1000, 5)),
            columns=["A", "B", "C", "D", "E"],
        )
        expected_nodes = ["A", "B", "C", "D", "E"]
        expected_edges = [("A", "B"), ("A", "C"), ("A", "D"), ("A", "E")]

        # Parent passed explicitly to a model that had no edges.
        self.model1.fit(values, "A")
        six.assertCountEqual(self, self.model1.nodes(), expected_nodes)
        six.assertCountEqual(self, self.model1.edges(), expected_edges)
        self.assertEqual(self.model1.parent_node, "A")
        self.assertSetEqual(self.model1.children_nodes, {"B", "C", "D", "E"})

        # No parent argument: model2 already has the edge ("A", "B").
        # These checks previously inspected model1 again, so model2's nodes
        # and edges were never verified.
        self.model2.fit(values)
        six.assertCountEqual(self, self.model2.nodes(), expected_nodes)
        six.assertCountEqual(self, self.model2.edges(), expected_edges)
        self.assertEqual(self.model2.parent_node, "A")
        self.assertSetEqual(self.model2.children_nodes, {"B", "C", "D", "E"})

    def test_fit_model_creation_exception(self):
        """fit() must raise ValueError for the three argument combinations below."""
        values = pd.DataFrame(
            np.random.randint(low=0, high=2, size=(1000, 5)),
            columns=["A", "B", "C", "D", "E"],
        )
        values2 = pd.DataFrame(
            np.random.randint(low=0, high=2, size=(1000, 3)), columns=["C", "D", "E"]
        )

        # model1 has no edges and no parent argument is supplied.
        self.assertRaises(ValueError, self.model1.fit, values)
        self.assertRaises(ValueError, self.model1.fit, values2)
        # Parent "A" is not among the columns of values2.
        self.assertRaises(ValueError, self.model2.fit, values2, "A")

    def tearDown(self):
        """Drop both fixtures."""
        del self.model1
        del self.model2
class TestNaiveBayesFit(unittest.TestCase):
    """Tests for the graph structure that ``NaiveBayes.fit`` builds from a DataFrame."""

    def setUp(self):
        """model1 starts without edges; model2 starts with the single edge ('A', 'B')."""
        self.model1 = NaiveBayes()
        self.model2 = NaiveBayes([('A', 'B')])

    def test_fit_model_creation(self):
        """fit() must yield A -> {B, C, D, E}, with or without an explicit parent."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        expected_nodes = ['A', 'B', 'C', 'D', 'E']
        expected_edges = [('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E')]

        # Parent passed explicitly to a model that had no edges.
        self.model1.fit(values, 'A')
        six.assertCountEqual(self, self.model1.nodes(), expected_nodes)
        six.assertCountEqual(self, self.model1.edges(), expected_edges)
        self.assertEqual(self.model1.parent_node, 'A')
        self.assertSetEqual(self.model1.children_nodes, {'B', 'C', 'D', 'E'})

        # No parent argument: model2 already has the edge ('A', 'B').
        # These checks previously inspected model1 again, so model2's nodes
        # and edges were never verified.
        self.model2.fit(values)
        six.assertCountEqual(self, self.model2.nodes(), expected_nodes)
        six.assertCountEqual(self, self.model2.edges(), expected_edges)
        self.assertEqual(self.model2.parent_node, 'A')
        self.assertSetEqual(self.model2.children_nodes, {'B', 'C', 'D', 'E'})

    def test_fit_model_creation_exception(self):
        """fit() must raise ValueError for the three argument combinations below."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        values2 = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 3)),
                               columns=['C', 'D', 'E'])

        # model1 has no edges and no parent argument is supplied.
        self.assertRaises(ValueError, self.model1.fit, values)
        self.assertRaises(ValueError, self.model1.fit, values2)
        # Parent 'A' is not among the columns of values2.
        self.assertRaises(ValueError, self.model2.fit, values2, 'A')

    def tearDown(self):
        """Drop both fixtures."""
        del self.model1
        del self.model2
class TestNaiveBayesFit(unittest.TestCase):
    """Tests for the graph structure that ``NaiveBayes.fit`` builds from a DataFrame."""

    def setUp(self):
        """model1 starts without edges; model2 starts with the single edge ('A', 'B')."""
        self.model1 = NaiveBayes()
        self.model2 = NaiveBayes([('A', 'B')])

    def test_fit_model_creation(self):
        """fit() must yield A -> {B, C, D, E}, with or without an explicit parent."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        expected_nodes = ['A', 'B', 'C', 'D', 'E']
        expected_edges = [('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E')]
        expected_children = {'B', 'C', 'D', 'E'}

        # Parent passed explicitly to a model that had no edges.
        self.model1.fit(values, 'A')
        six.assertCountEqual(self, self.model1.nodes(), expected_nodes)
        six.assertCountEqual(self, self.model1.edges(), expected_edges)
        self.assertEqual(self.model1.parent_node, 'A')
        self.assertSetEqual(self.model1.children_nodes, expected_children)

        # No parent argument: model2 already has the edge ('A', 'B').
        # These checks previously inspected model1 again, so model2's nodes
        # and edges were never verified.
        self.model2.fit(values)
        six.assertCountEqual(self, self.model2.nodes(), expected_nodes)
        six.assertCountEqual(self, self.model2.edges(), expected_edges)
        self.assertEqual(self.model2.parent_node, 'A')
        self.assertSetEqual(self.model2.children_nodes, expected_children)

    def test_fit_model_creation_exception(self):
        """fit() must raise ValueError for the three argument combinations below."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        values2 = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 3)),
                               columns=['C', 'D', 'E'])

        # model1 has no edges and no parent argument is supplied.
        self.assertRaises(ValueError, self.model1.fit, values)
        self.assertRaises(ValueError, self.model1.fit, values2)
        # Parent 'A' is not among the columns of values2.
        self.assertRaises(ValueError, self.model2.fit, values2, 'A')

    def tearDown(self):
        """Drop both fixtures."""
        del self.model1
        del self.model2
def setUp(self):
    """Create model1 with no arguments and model2 from the single edge ('A', 'B')."""
    seed_edges = [('A', 'B')]
    self.model1 = NaiveBayes()
    self.model2 = NaiveBayes(seed_edges)
def setUp(self):
    """Instantiate ``NaiveBayes`` with no arguments as the shared fixture ``self.G``."""
    self.G = NaiveBayes()
def test_update_node_parents_bm_constructor(self):
    """Predecessors of each node must match the edges given to the constructor.

    ``predecessors()`` is materialised with ``list()`` before comparison:
    ``assertListEqual`` requires real lists, and networkx 2.x returns an
    iterator here (networkx 1.x returned a list, for which ``list()`` is a
    harmless copy).
    """
    self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
    self.assertListEqual(list(self.g.predecessors('a')), [])
    self.assertListEqual(list(self.g.predecessors('b')), ['a'])
    self.assertListEqual(list(self.g.predecessors('c')), ['a'])
class TestBaseModelCreation(unittest.TestCase):
    """Construction and mutation tests for ``NaiveBayes`` graphs.

    Every ``nodes()``, ``edges()`` and ``predecessors()`` result is wrapped
    in ``list()`` before comparison. ``assertListEqual`` only accepts real
    lists, and networkx 2.x returns views/iterators from these methods; on
    networkx 1.x the wrapper is a harmless copy.
    """

    def setUp(self):
        """Instantiate ``NaiveBayes`` with no arguments as ``self.G``."""
        self.G = NaiveBayes()

    def test_class_init_without_data(self):
        """A NaiveBayes instance is an ``nx.DiGraph``."""
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        """Construct from string edges; reject three invalid edge lists."""
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        six.assertCountEqual(self, list(self.g.nodes()), ['a', 'b', 'c'])
        six.assertCountEqual(self, list(self.g.edges()), [('a', 'b'), ('a', 'c')])
        self.assertEqual(self.g.parent_node, 'a')
        self.assertSetEqual(self.g.children_nodes, {'b', 'c'})

        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('b', 'c')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('c', 'b')])
        self.assertRaises(ValueError, NaiveBayes, [('a', 'b'), ('d', 'e')])

    def test_class_init_with_data_nonstring(self):
        """Construct from integer edges; reject three invalid edge lists."""
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, list(self.g.nodes()), [1, 2, 3])
        six.assertCountEqual(self, list(self.g.edges()), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (2, 3)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 2)])
        self.assertRaises(ValueError, NaiveBayes, [(1, 2), (3, 4)])

    def test_add_node_string(self):
        """add_node() with a string label."""
        self.G.add_node('a')
        self.assertListEqual(list(self.G.nodes()), ['a'])

    def test_add_node_nonstring(self):
        """add_node() with an integer label."""
        self.G.add_node(1)
        self.assertListEqual(list(self.G.nodes()), [1])

    def test_add_nodes_from_string(self):
        """add_nodes_from() with string labels."""
        self.G.add_nodes_from(['a', 'b', 'c', 'd'])
        six.assertCountEqual(self, list(self.G.nodes()), ['a', 'b', 'c', 'd'])

    def test_add_nodes_from_non_string(self):
        """add_nodes_from() with integer labels."""
        self.G.add_nodes_from([1, 2, 3, 4])
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2, 3, 4])

    def test_add_edge_string(self):
        """add_edge() tracks parent/children and rejects the edges listed below."""
        self.G.add_edge('a', 'b')
        six.assertCountEqual(self, list(self.G.nodes()), ['a', 'b'])
        self.assertListEqual(list(self.G.edges()), [('a', 'b')])
        self.assertEqual(self.G.parent_node, 'a')
        self.assertSetEqual(self.G.children_nodes, {'b'})

        self.G.add_nodes_from(['c', 'd'])
        self.G.add_edge('a', 'c')
        self.G.add_edge('a', 'd')
        six.assertCountEqual(self, list(self.G.nodes()), ['a', 'b', 'c', 'd'])
        six.assertCountEqual(self, list(self.G.edges()),
                             [('a', 'b'), ('a', 'c'), ('a', 'd')])
        self.assertEqual(self.G.parent_node, 'a')
        self.assertSetEqual(self.G.children_nodes, {'b', 'c', 'd'})

        self.assertRaises(ValueError, self.G.add_edge, 'b', 'c')
        self.assertRaises(ValueError, self.G.add_edge, 'd', 'f')
        self.assertRaises(ValueError, self.G.add_edge, 'e', 'f')
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'e'), ('b', 'f')])
        self.assertRaises(ValueError, self.G.add_edges_from, [('b', 'f')])

    def test_add_edge_nonstring(self):
        """Integer-label counterpart of ``test_add_edge_string``."""
        self.G.add_edge(1, 2)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2])
        self.assertListEqual(list(self.G.edges()), [(1, 2)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2})

        self.G.add_nodes_from([3, 4])
        self.G.add_edge(1, 3)
        self.G.add_edge(1, 4)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2, 3, 4])
        six.assertCountEqual(self, list(self.G.edges()), [(1, 2), (1, 3), (1, 4)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2, 3, 4})

        self.assertRaises(ValueError, self.G.add_edge, 2, 3)
        self.assertRaises(ValueError, self.G.add_edge, 3, 6)
        self.assertRaises(ValueError, self.G.add_edge, 5, 6)
        self.assertRaises(ValueError, self.G.add_edges_from, [(1, 5), (2, 6)])
        self.assertRaises(ValueError, self.G.add_edges_from, [(2, 6)])

    def test_add_edge_selfloop(self):
        """add_edge() must reject a self loop."""
        self.assertRaises(ValueError, self.G.add_edge, 'a', 'a')
        self.assertRaises(ValueError, self.G.add_edge, 1, 1)

    def test_add_edges_from_self_loop(self):
        """add_edges_from() must reject a self loop."""
        self.assertRaises(ValueError, self.G.add_edges_from, [('a', 'a')])

    def test_update_node_parents_bm_constructor(self):
        """Predecessors match the edges given to the constructor."""
        self.g = NaiveBayes([('a', 'b'), ('a', 'c')])
        self.assertListEqual(list(self.g.predecessors('a')), [])
        self.assertListEqual(list(self.g.predecessors('b')), ['a'])
        self.assertListEqual(list(self.g.predecessors('c')), ['a'])

    def test_update_node_parents(self):
        """Predecessors match the edges added through add_edges_from()."""
        self.G.add_nodes_from(['a', 'b', 'c'])
        self.G.add_edges_from([('a', 'b'), ('a', 'c')])
        self.assertListEqual(list(self.G.predecessors('a')), [])
        self.assertListEqual(list(self.G.predecessors('b')), ['a'])
        self.assertListEqual(list(self.G.predecessors('c')), ['a'])

    def tearDown(self):
        """Drop the fixture."""
        del self.G
class PGMNaiveBayes(TextClassifier):
    '''Text classifier that keeps its state in a ``NaiveBayes`` network.

    The network has one node named ``Data.CATEGORY_NAME`` and one two-state
    node per token, each token node having the category node as its only
    evidence. ``self.categories`` maps a category to the index stored for
    it; ``self.tokens`` maps a token to its ``TabularCPD``.
    '''

    # NOTE(review): ``List[(str, str)]`` below is not a valid typing form
    # (presumably ``List[Tuple[str, str]]`` is meant). It is left untouched
    # because ``Tuple`` may not be imported in this file -- confirm and fix
    # together with the import block.
    def __add_category(
        self, categories: Union[str, List[str], List[(str, str)], Dict[str, str]]
    ) -> PGMNaiveBayes:
        '''Register new categories and rebuild the class CPD if any was added.

        ``categories`` may be one name, a list of names, a list of
        ``(name, index)`` pairs, or a dict mapping name to index. A bare
        name is stored with itself as its index.
        '''
        if isinstance(categories, str):
            categories = [categories]
        if isinstance(categories, dict):
            categories = categories.items()
        to_create = False
        for category in categories:
            if isinstance(category, str):
                category = (category, category)
            category, index = category
            if category not in self.categories:
                self.categories[category] = index
                self.cardinality = len(self.categories) or 1
                to_create = True
        if to_create:
            self.__create_class_cpd()
        return self

    def __add_token(self, tokens: Union[str, List[str]]) -> PGMNaiveBayes:
        '''Create CPDs for the tokens that are not known yet.

        Repeated tokens in the input are collapsed first: a repeat would be
        counted twice in ``total_tokens`` and would make
        ``__create_word_cpd`` try to remove a CPD that was never added to
        the model.
        '''
        if isinstance(tokens, str):
            tokens = [tokens]
        # dict.fromkeys drops repeats while keeping first-seen order.
        to_create = [t for t in dict.fromkeys(tokens) if t not in self.tokens]
        self.total_tokens += len(to_create)
        self.__create_word_cpd(to_create)
        return self

    def __create_word_cpd(self, tokens: Union[str, List[str]], check: bool = True) -> PGMNaiveBayes:
        '''Generate the table for the given token node(s).

        Each token gets a 2 x ``self.cardinality`` table filled with 0.5.
        A CPD already recorded for the token is removed from the model
        before the new one is added.
        '''
        if isinstance(tokens, str):
            tokens = [tokens]
        cpds = []
        for token in tokens:
            if token in self.tokens:
                self.model.remove_cpds(self.tokens[token])
            cpd_word = TabularCPD(
                variable=token,
                variable_card=2,
                evidence=[Data.CATEGORY_NAME],
                evidence_card=[self.cardinality],
                values=[[0.5] * self.cardinality for _ in range(2)])
            self.tokens[token] = cpd_word
            cpds.append(cpd_word)
        self.model.add_nodes_from(tokens)
        self.model.add_edges_from([(Data.CATEGORY_NAME, token) for token in tokens])
        self.model.add_cpds(*cpds)
        # NOTE(review): ``check`` is currently ignored; the model is always
        # validated. Confirm whether the flag should gate this call.
        self.model.check_model()
        return self

    def __create_class_cpd(self, check: bool = True) -> PGMNaiveBayes:
        '''Generate the table for the category node.

        The table gives each of the ``self.cardinality`` states the same
        probability. A previously stored class CPD is removed first.
        '''
        if self.cpd_class:
            self.model.remove_cpds(self.cpd_class)
        self.cpd_class = TabularCPD(
            variable=Data.CATEGORY_NAME,
            variable_card=self.cardinality,
            values=[[1 / self.cardinality] for _ in range(self.cardinality)])
        self.model.add_cpds(self.cpd_class)
        # NOTE(review): ``check`` is currently ignored; the model is always
        # validated. Confirm whether the flag should gate this call.
        self.model.check_model()
        return self

    def __cpd_to_json(self, cpd: TabularCPD) -> Dict:
        '''Convert a CPD into a dict of plain lists and values.'''
        return {
            'variable': cpd.variable,
            'variables': cpd.variables,
            'variable_card': cpd.variable_card.tolist(),
            'values': cpd.values.tolist()
        }

    def __cpd_from_json(self, cpd: Dict) -> TabularCPD:
        '''Build a ``TabularCPD`` by passing the dict entries as keyword arguments.'''
        return TabularCPD(**cpd)

    def reset(self) -> PGMNaiveBayes:
        '''Totally reset the Classifier'''
        self.categories = {}
        self.tokens = {}
        self.cardinality = 1
        self.total_documents = 0
        self.total_tokens = 0
        self.cpd_class = None
        self.model = NaiveBayes()
        self.model.add_node(Data.CATEGORY_NAME)
        return self

    def token_probability(self, token: str, category: str) -> float:
        '''Return the probability of a given token belonging to a given category.

        Runs ``predict_probability`` on a single row where ``token`` is 1
        and reads the column named after the category's stored index;
        returns 0 when that column is absent.
        '''
        probability = self.model.predict_probability(
            pd.DataFrame([[1]], columns=[token]))
        column = '{}_{}'.format(Data.CATEGORY_NAME,
                                self.categories.get(category, 0))
        return probability[column][0] if column in probability else 0

    def category_probability(self, category: str) -> float:
        '''Return the probability of the given category.

        Queries the category node with variable elimination and returns the
        value at the state number of the category's stored index (index 0
        when the category is unknown).
        '''
        elimination = VariableElimination(self.model)
        probability = elimination.query(variables=[Data.CATEGORY_NAME])
        state = probability.get_state_no(Data.CATEGORY_NAME,
                                         self.categories.get(category, 0))
        return probability.values[state]

    def word_probability(self, text: str) -> pd.DataFrame:
        '''Retrieve the probability table of the given text without knowing
        the probability of the category (no evidence): P(C | w1,...,wn)

        The result has one row per token of the text and one column per
        category. Unknown tokens get ``1 / cardinality`` in every column.
        '''
        data = Data(text)
        elimination = VariableElimination(self.model)
        values = [[] for _ in range(self.cardinality)]
        for token in data.tokens:
            if token not in self.tokens:
                for v in values:
                    v.append(1 / (self.cardinality or 1))
            else:
                posterior = elimination.query(
                    variables=[Data.CATEGORY_NAME],
                    evidence={token: 1}).values
                for i, p in enumerate(posterior):
                    values[i].append(p)
        return pd.DataFrame(np.array(values).T,
                            columns=list(self.categories),
                            index=data.tokens)

    def probability(self, text: str) -> pd.DataFrame:
        '''Retrieve the probability table of the given text knowing the
        probability of categories: P(C) * P(C | w1,...,wn)

        Builds one row with a 0/1 flag per known token (1 when the token is
        in ``data.table``), runs ``predict_probability`` and renames the
        result columns from ``<CATEGORY_NAME>_<index>`` to category names.
        '''
        data = Data(text)
        values = pd.DataFrame(
            [[1 if t in data.table else 0 for t in self.tokens]],
            columns=self.tokens)
        probabilities = self.model.predict_probability(values)
        return probabilities.rename(
            columns={
                '{}_{}'.format(Data.CATEGORY_NAME, v): k
                for k, v in self.categories.items()
            })

    def fit(self,
            text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
            category: Union[str, Iterable[str]] = None) -> TextClassifier:
        '''Learn probabilities for tokens extracted from the given text.

        Registers the categories and tokens found by ``DataSet.FromAny``,
        builds a 0/1 token-presence frame with the score as the
        ``Data.CATEGORY_NAME`` column and fits the model on it.
        '''
        data = DataSet.FromAny(text, category)
        categories = []
        tokens = {}  # dict used as an insertion-ordered set
        values = []
        for d in data:
            categories.append((d.category, d.score))
            for token in d.tokens:
                tokens[token] = 1
            values.append((d.table, d.score))
            self.total_documents += 1
        tokens = list(tokens)
        self.__add_category(categories)
        self.__add_token(tokens)
        data_values = [[1 if t in v[0] else 0 for t in tokens] + [v[1]]
                       for v in values]
        tokens.append(Data.CATEGORY_NAME)
        data_values = pd.DataFrame(data_values, columns=tokens)
        self.model.fit(data_values, Data.CATEGORY_NAME)
        return self

    def words(self, categories: Union[str, Iterable[str]]) -> pd.DataFrame:
        '''Return a table with tokens as rows and categories as columns.

        Each cell is the category-node query result given that token is 1.
        NOTE(review): ``categories`` is not used and the rows are not
        sorted, although the previous docstring said so -- confirm the
        intended behaviour.
        '''
        elimination = VariableElimination(self.model)
        values = [[] for _ in range(self.cardinality)]
        for token in self.tokens:
            posterior = elimination.query(
                variables=[Data.CATEGORY_NAME],
                evidence={token: 1}).values
            for i, p in enumerate(posterior):
                values[i].append(p)
        return pd.DataFrame(np.array(values).T,
                            columns=list(self.categories),
                            index=list(self.tokens))

    def to_json(self) -> Dict:
        '''Export categories, document count and every token CPD table.'''
        return {
            'categories': self.categories,
            'total_documents': self.total_documents,
            'tokens': {
                c.variable: c.values.tolist()
                for c in self.model.get_cpds()
                if c.variable != Data.CATEGORY_NAME
            },
        }

    def from_json(self, data: Dict) -> PGMNaiveBayes:
        '''Load categories, the optional class CPD and token tables from ``data``.'''
        self.total_documents = data.get('total_documents',
                                        self.total_documents)
        self.__add_category(data.get('categories', {}))
        self.model.remove_cpds(self.cpd_class)
        self.cpd_class = TabularCPD(
            **data.get('class')) if 'class' in data else self.cpd_class
        self.model.add_cpds(self.cpd_class)
        tokens = data.get('tokens', {})
        self.__add_token(list(tokens))
        cpds = {c.variable: c for c in self.model.get_cpds()}
        for token, values in tokens.items():
            if token in cpds:
                # NOTE(review): both axes are cut to ``cardinality``; token
                # tables are created with 2 rows, so a cardinality of 1
                # would drop a row -- confirm the intended slice.
                cpds[token].values = np.array(values)[0:self.cardinality,
                                                      0:self.cardinality]
        self.model.check_model()
        return self

    def __str__(self) -> str:
        return 'NaiveBayes<{}, {}>[{}]'.format(self.total_documents,
                                               self.total_tokens,
                                               ', '.join(self.categories))

    def __repr__(self) -> str:
        return str(self)
# Print an example of 1 instance in the dataset print("\nAn example of a person") print(df.iloc[0]) # Split the data to test and train test_size = 0.33 print("\nSplitting in to training and test data using: Test size = ", test_size) data_train, data_test = train_test_split(df, test_size=test_size) print("training data:", len(data_train)) print("test data:", len(data_test)) ################################################################################# ##### Defining the model ################################################################################# model = NaiveBayes() # Learning CPDs using Maximum Likelihood Estimators model.fit(data_train, 'class', estimator=MaximumLikelihoodEstimator) # Print the CPDs learned print("\n\n............Overview of our CPDs from the fit...........:") for cpd in model.get_cpds(): print("CPD of {variable}:".format(variable=cpd.variable)) print(cpd) print("\n\n............Overview of levels in variables...........:") for col in df: print(col,":", len(df[col].unique()) ) ################################################################################# ##### Using the model to query
def setUp(self):
    """Create model1 with no arguments and model2 from the single edge ('A', 'B')."""
    seed_edges = [('A', 'B')]
    self.model1 = NaiveBayes()
    self.model2 = NaiveBayes(seed_edges)
def setUp(self):
    """Create model1 with no arguments and model2 from the single edge ("A", "B")."""
    seed_edges = [("A", "B")]
    self.model1 = NaiveBayes()
    self.model2 = NaiveBayes(seed_edges)
def setUp(self):
    """Build G1 from edges 'a' -> b, c, d, e and G2 from edges 'd' -> g, l, s."""
    edges_from_a = [('a', target) for target in ('b', 'c', 'd', 'e')]
    edges_from_d = [('d', target) for target in ('g', 'l', 's')]
    self.G1 = NaiveBayes(edges_from_a)
    self.G2 = NaiveBayes(edges_from_d)
CPD = pickle.load(fp) with open("RandomColumns.txt", "rb") as fp: random_columns = pickle.load(fp) with open("RandomIndices.txt", "rb") as fp: random_indices = pickle.load(fp) data = data.iloc[:, random_columns] column_size = data.shape[1] #Delete invoices with all zeros from the data data = data[(data.T != 0).any()] row_size = data.shape[0] smallDF = data.iloc[random_indices, :] smallDF.shape DictOfModels = {} for productName in smallDF.columns: print('Collecting model for {0}'.format(productName)) model = NaiveBayes() model.add_nodes_from(Nodes[productName]) model.add_edges_from(Edges[productName]) model.add_cpds(*CPD[productName]) DictOfModels[productName] = model #Save edge ,node, CPD information PseudoCounts = {} #Pseudocounts are given (1,1) for uniform for productName in smallDF.columns: PseudoCounts[productName] = [1, 1] except: print('Existing model not found') #Select random invoice (2000) and products (50) seed(0) column_size = data.shape[1] random_columns = sample(range(column_size), 100)
class TestNaiveBayesMethods(unittest.TestCase):
    """Query-method tests (independencies, active trails) on two fixed models."""

    def setUp(self):
        """Build G1 (source 'a', four targets) and G2 (source 'd', three targets)."""
        self.G1 = NaiveBayes([('a', child) for child in 'bcde'])
        self.G2 = NaiveBayes([('d', child) for child in 'gls'])

    def test_local_independencies(self):
        """local_independencies() must return the expected one-element list per node."""
        expectations = (
            ('a', [None]),
            ('b', [Independencies(['b', ['e', 'c', 'd'], 'a'])]),
            ('c', [Independencies(['c', ['e', 'b', 'd'], 'a'])]),
            ('d', [Independencies(['d', ['b', 'c', 'e'], 'a'])]),
        )
        for node, expected in expectations:
            self.assertListEqual(self.G1.local_independencies(node), expected)

    def test_active_trail_nodes(self):
        """With nothing observed, every node of G2 reaches all four nodes."""
        everything = ['d', 'g', 'l', 's']
        for start in ('d', 'g', 'l', 's'):
            self.assertListEqual(
                sorted(self.G2.active_trail_nodes(start)), everything)

    def test_active_trail_nodes_args(self):
        """Observed nodes change the sorted active-trail result as listed."""
        cases = (
            ('d', 'g', ['d', 'l', 's']),
            ('l', 'g', ['d', 'l', 's']),
            ('s', ['g', 'l'], ['d', 's']),
            ('s', ['d', 'l'], ['s']),
        )
        for start, observed, expected in cases:
            reached = self.G2.active_trail_nodes(start, observed=observed)
            self.assertListEqual(sorted(reached), expected)

    def tearDown(self):
        """Drop both fixtures."""
        del self.G1, self.G2
class TestNaiveBayesMethods(unittest.TestCase):
    """Query-method tests (independencies, active trails, ancestors) on two fixed models."""

    def setUp(self):
        """Build G1 (source "a", four targets) and G2 (source "d", three targets)."""
        self.G1 = NaiveBayes([("a", child) for child in "bcde"])
        self.G2 = NaiveBayes([("d", child) for child in "gls"])

    def test_local_independencies(self):
        """local_independencies() must equal the expected Independencies per node."""
        expectations = (
            ("a", Independencies()),
            ("b", Independencies(["b", ["e", "c", "d"], "a"])),
            ("c", Independencies(["c", ["e", "b", "d"], "a"])),
            ("d", Independencies(["d", ["b", "c", "e"], "a"])),
        )
        for node, expected in expectations:
            self.assertEqual(self.G1.local_independencies(node), expected)

    def test_active_trail_nodes(self):
        """With nothing observed, every node of G2 reaches all four nodes."""
        everything = ["d", "g", "l", "s"]
        for start in ("d", "g", "l", "s"):
            self.assertListEqual(
                sorted(self.G2.active_trail_nodes(start)), everything
            )

    def test_active_trail_nodes_args(self):
        """Observed nodes change the sorted active-trail result as listed."""
        cases = (
            ("d", "g", ["d", "l", "s"]),
            ("l", "g", ["d", "l", "s"]),
            ("s", ["g", "l"], ["d", "s"]),
            ("s", ["d", "l"], ["s"]),
        )
        for start, observed, expected in cases:
            reached = self.G2.active_trail_nodes(start, observed=observed)
            self.assertListEqual(sorted(reached), expected)

    def test_get_ancestors_of(self):
        """_get_ancestors_of() result, sorted, includes the queried node(s) themselves."""
        cases = (
            ("b", ["a", "b"]),
            ("e", ["a", "e"]),
            ("a", ["a"]),
            (["b", "e"], ["a", "b", "e"]),
        )
        for query, expected in cases:
            self.assertListEqual(
                sorted(self.G1._get_ancestors_of(query)), expected
            )

    def tearDown(self):
        """Drop both fixtures."""
        del self.G1, self.G2
def setUp(self):
    """Build G1 from edges "a" -> b, c, d, e and G2 from edges "d" -> g, l, s."""
    edges_from_a = [("a", target) for target in ("b", "c", "d", "e")]
    edges_from_d = [("d", target) for target in ("g", "l", "s")]
    self.G1 = NaiveBayes(edges_from_a)
    self.G2 = NaiveBayes(edges_from_d)
class TestBaseModelCreation(unittest.TestCase):
    """Construction and mutation tests for ``NaiveBayes`` graphs."""

    def setUp(self):
        """Instantiate ``NaiveBayes`` with no arguments as ``self.G``."""
        self.G = NaiveBayes()

    def test_class_init_without_data(self):
        """A NaiveBayes instance is an ``nx.DiGraph``."""
        self.assertIsInstance(self.G, nx.DiGraph)

    def test_class_init_with_data_string(self):
        """Construct from string edges; reject three invalid edge lists."""
        self.g = NaiveBayes([("a", "b"), ("a", "c")])
        six.assertCountEqual(self, list(self.g.nodes()), ["a", "b", "c"])
        six.assertCountEqual(self, list(self.g.edges()), [("a", "b"), ("a", "c")])
        self.assertEqual(self.g.parent_node, "a")
        self.assertSetEqual(self.g.children_nodes, {"b", "c"})

        rejected_edge_lists = (
            [("a", "b"), ("b", "c")],
            [("a", "b"), ("c", "b")],
            [("a", "b"), ("d", "e")],
        )
        for edge_list in rejected_edge_lists:
            with self.assertRaises(ValueError):
                NaiveBayes(edge_list)

    def test_class_init_with_data_nonstring(self):
        """Construct from integer edges; reject three invalid edge lists."""
        self.g = NaiveBayes([(1, 2), (1, 3)])
        six.assertCountEqual(self, list(self.g.nodes()), [1, 2, 3])
        six.assertCountEqual(self, list(self.g.edges()), [(1, 2), (1, 3)])
        self.assertEqual(self.g.parent_node, 1)
        self.assertSetEqual(self.g.children_nodes, {2, 3})

        rejected_edge_lists = (
            [(1, 2), (2, 3)],
            [(1, 2), (3, 2)],
            [(1, 2), (3, 4)],
        )
        for edge_list in rejected_edge_lists:
            with self.assertRaises(ValueError):
                NaiveBayes(edge_list)

    def test_add_node_string(self):
        """add_node() with a string label."""
        self.G.add_node("a")
        node_list = list(self.G.nodes())
        self.assertListEqual(node_list, ["a"])

    def test_add_node_nonstring(self):
        """add_node() with an integer label."""
        self.G.add_node(1)
        node_list = list(self.G.nodes())
        self.assertListEqual(node_list, [1])

    def test_add_nodes_from_string(self):
        """add_nodes_from() with string labels."""
        labels = ["a", "b", "c", "d"]
        self.G.add_nodes_from(labels)
        six.assertCountEqual(self, list(self.G.nodes()), ["a", "b", "c", "d"])

    def test_add_nodes_from_non_string(self):
        """add_nodes_from() with integer labels."""
        labels = [1, 2, 3, 4]
        self.G.add_nodes_from(labels)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2, 3, 4])

    def test_add_edge_string(self):
        """add_edge() tracks parent/children and rejects the edges listed below."""
        self.G.add_edge("a", "b")
        six.assertCountEqual(self, list(self.G.nodes()), ["a", "b"])
        self.assertListEqual(list(self.G.edges()), [("a", "b")])
        self.assertEqual(self.G.parent_node, "a")
        self.assertSetEqual(self.G.children_nodes, {"b"})

        self.G.add_nodes_from(["c", "d"])
        for target in ("c", "d"):
            self.G.add_edge("a", target)
        six.assertCountEqual(self, list(self.G.nodes()), ["a", "b", "c", "d"])
        six.assertCountEqual(
            self, list(self.G.edges()), [("a", "b"), ("a", "c"), ("a", "d")]
        )
        self.assertEqual(self.G.parent_node, "a")
        self.assertSetEqual(self.G.children_nodes, {"b", "c", "d"})

        # Order is kept: a rejected call may still have touched the graph.
        for source, target in (("b", "c"), ("d", "f"), ("e", "f")):
            with self.assertRaises(ValueError):
                self.G.add_edge(source, target)
        for edge_list in ([("a", "e"), ("b", "f")], [("b", "f")]):
            with self.assertRaises(ValueError):
                self.G.add_edges_from(edge_list)

    def test_add_edge_nonstring(self):
        """Integer-label counterpart of ``test_add_edge_string``."""
        self.G.add_edge(1, 2)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2])
        self.assertListEqual(list(self.G.edges()), [(1, 2)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2})

        self.G.add_nodes_from([3, 4])
        for target in (3, 4):
            self.G.add_edge(1, target)
        six.assertCountEqual(self, list(self.G.nodes()), [1, 2, 3, 4])
        six.assertCountEqual(self, list(self.G.edges()), [(1, 2), (1, 3), (1, 4)])
        self.assertEqual(self.G.parent_node, 1)
        self.assertSetEqual(self.G.children_nodes, {2, 3, 4})

        # Order is kept: a rejected call may still have touched the graph.
        for source, target in ((2, 3), (3, 6), (5, 6)):
            with self.assertRaises(ValueError):
                self.G.add_edge(source, target)
        for edge_list in ([(1, 5), (2, 6)], [(2, 6)]):
            with self.assertRaises(ValueError):
                self.G.add_edges_from(edge_list)

    def test_add_edge_selfloop(self):
        """add_edge() must reject a self loop."""
        for label in ("a", 1):
            with self.assertRaises(ValueError):
                self.G.add_edge(label, label)

    def test_add_edges_from_self_loop(self):
        """add_edges_from() must reject a self loop."""
        with self.assertRaises(ValueError):
            self.G.add_edges_from([("a", "a")])

    def test_update_node_parents_bm_constructor(self):
        """Predecessors match the edges given to the constructor."""
        self.g = NaiveBayes([("a", "b"), ("a", "c")])
        for node, parents in (("a", []), ("b", ["a"]), ("c", ["a"])):
            self.assertListEqual(list(self.g.predecessors(node)), parents)

    def test_update_node_parents(self):
        """Predecessors match the edges added through add_edges_from()."""
        self.G.add_nodes_from(["a", "b", "c"])
        self.G.add_edges_from([("a", "b"), ("a", "c")])
        for node, parents in (("a", []), ("b", ["a"]), ("c", ["a"])):
            self.assertListEqual(list(self.G.predecessors(node)), parents)

    def tearDown(self):
        """Drop the fixture."""
        del self.G
def test_update_node_parents_bm_constructor(self):
    """Predecessors of each node must match the edges given to the constructor."""
    self.g = NaiveBayes([("a", "b"), ("a", "c")])
    expected_parents = (("a", []), ("b", ["a"]), ("c", ["a"]))
    for node, parents in expected_parents:
        found = list(self.g.predecessors(node))
        self.assertListEqual(found, parents)
# Split the data to test and train print("Data set size:", len(df)) print("\n\n............Splitting the data in test and train...........:\n") test_size = 0.33 print("Test size = ", test_size) data_train, data_test = train_test_split(df, test_size=test_size) print("training data:", len(data_train)) print("test data:", len(data_test)) input("\n ") ################################################################################# ##### Defining the model ################################################################################# model = NaiveBayes() # Learning CPDs using Maximum Likelihood Estimators model.fit(data_train, 'class', estimator=MaximumLikelihoodEstimator) # Print the CPDs learned print("\n\n............Selected CPDs from the fit...........:\n") print('CPD: class (parent of all nodes)') print(model.get_cpds('class')) input("\n ") print('\nCPD: sex') print(model.get_cpds('sex')) # print(model.get_cpds('race')) input("\n ") # print("\n\n............Overview of levels in variables...........:\n")
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# The columns of names.csv supply the column names for the headerless data file.
col_names = pd.read_csv('data/names.csv')
data = pd.read_csv('data/breast-cancer-wisconsin.data', names=col_names.columns)
# Drop the rows whose bare_nuclei value is the '?' placeholder.
data = data[data["bare_nuclei"] != '?']
data.set_index('id', inplace=True)  # stop the model from using id as a node
train, test = train_test_split(data, test_size=0.2, random_state=0)
Y_test = test['class']
test = test.drop(['class'], axis=1)

# fit model
model = NaiveBayes()
model.fit(train, 'class')
print("Naive Bayes edges: ", model.edges())

# make predictions
Y_pred = model.predict(test)

# Convert labels so we can use the sklearn functions to evaluate our model.
# The encoder is fitted ONCE on the labels of both series and then applied
# to each. Calling fit_transform separately on each gives two independent
# encodings that disagree whenever one side lacks a class (for example,
# predictions that are all the same label).
labelencoder = LabelEncoder()
true_labels = Y_test.values.ravel()
pred_labels = Y_pred.values.ravel()
labelencoder.fit(list(true_labels) + list(pred_labels))
Y_test = labelencoder.transform(true_labels)
Y_pred = labelencoder.transform(pred_labels)

# Output results
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)