class TestNaiveBayesFit(unittest.TestCase):
    """Tests for the structure that ``NaiveBayes.fit`` builds from a DataFrame."""

    def setUp(self):
        # model1 is created empty; model2 is created from the edge list
        # [('A', 'B')].
        self.model1 = NaiveBayes()
        self.model2 = NaiveBayes([('A', 'B')])

    def _assert_star_on_a(self, model):
        """Assert *model* has nodes A..E with 'A' as the parent of all others."""
        six.assertCountEqual(self, model.nodes(), ['A', 'B', 'C', 'D', 'E'])
        six.assertCountEqual(self, model.edges(),
                             [('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E')])
        self.assertEqual(model.parent_node, 'A')
        self.assertSetEqual(model.children_nodes, {'B', 'C', 'D', 'E'})

    def test_fit_model_creation(self):
        """Both models must end up with the same A-rooted structure after fit."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])

        # Parent node passed explicitly to fit().
        self.model1.fit(values, 'A')
        self._assert_star_on_a(self.model1)

        # No parent node passed to fit(). The structure checks must target
        # model2 here; previously they re-checked model1, leaving model2's
        # nodes and edges unverified.
        self.model2.fit(values)
        self._assert_star_on_a(self.model2)

    def test_fit_model_creation_exception(self):
        """fit() must raise ValueError for the three invalid calls below."""
        values = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 5)),
                              columns=['A', 'B', 'C', 'D', 'E'])
        values2 = pd.DataFrame(np.random.randint(low=0, high=2, size=(1000, 3)),
                               columns=['C', 'D', 'E'])

        # model1 was created without edges and no parent node is given.
        self.assertRaises(ValueError, self.model1.fit, values)
        self.assertRaises(ValueError, self.model1.fit, values2)
        # 'A' is requested as parent but values2 has no column 'A'.
        self.assertRaises(ValueError, self.model2.fit, values2, 'A')

    def tearDown(self):
        del self.model1
        del self.model2
row_size = data.shape[0] random_indices = sample(range(row_size), 2000) smallDF = data.iloc[random_indices, :] smallDF.shape PseudoCounts = {} #Pseudocounts are given (1,1) for uniform for productName in smallDF.columns: PseudoCounts[productName] = [1, 1] DictOfModels = {} Edges = {} Nodes = {} CPD = {} for productName in smallDF.columns: print('Building model for {0}'.format(productName)) model = NaiveBayes() model.fit(smallDF, productName) DictOfModels[productName] = model #Save edge ,node, CPD information Edges[productName] = model.edges() Nodes[productName] = model.nodes() CPD[productName] = model.get_cpds() with open("Edges.txt", "wb") as fp: pickle.dump(Edges, fp) with open("Nodes.txt", "wb") as fp: pickle.dump(Nodes, fp) with open("CPD.txt", "wb") as fp: pickle.dump(CPD, fp) with open("RandomColumns.txt", "wb") as fp:
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# The column names are taken from the header of a separate CSV file.
col_names = pd.read_csv('data/names.csv')
data = pd.read_csv('data/breast-cancer-wisconsin.data', names=col_names.columns)

# Drop the rows whose "bare_nuclei" entry is the '?' placeholder.
data = data[data["bare_nuclei"] != '?']
# Move 'id' into the index to stop the model from using id as a node.
data = data.set_index('id')

train, test = train_test_split(data, test_size=0.2, random_state=0)
Y_test = test['class']
test = test.drop(['class'], axis=1)

# Fit model with 'class' as the parent node.
model = NaiveBayes()
model.fit(train, 'class')
print("Naive Bayes edges: ", model.edges())

# Make predictions.
Y_pred = model.predict(test)

# Convert labels so we can use sklearn functions to evaluate our model.
# The encoder is fitted ONCE on every label in the dataset and then only
# applied with transform(); refitting it on the predictions could map the
# same class to different integers in Y_test and Y_pred (e.g. when only one
# class is predicted), which would corrupt every metric below.
labelencoder = LabelEncoder()
labelencoder.fit(data['class'].values.ravel())
Y_test = labelencoder.transform(Y_test.values.ravel())
Y_pred = labelencoder.transform(Y_pred.values.ravel())

# Output results.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print({"Accuracy": accuracy, "Precision": precision, "F1 Score": f1})
# ---------------------------------------------------------------------------
# Split the data into train and test sets
# ---------------------------------------------------------------------------
test_size = 0.33
print("\nSplitting in to training and test data using: Test size = ", test_size)
data_train, data_test = train_test_split(df, test_size=test_size)
for caption, subset in (("training data:", data_train),
                        ("test data:", data_test)):
    print(caption, len(subset))

# ---------------------------------------------------------------------------
# Defining the model
# ---------------------------------------------------------------------------
model = NaiveBayes()

# Learn the CPDs with the maximum-likelihood estimator, 'class' as parent.
model.fit(data_train, 'class', estimator=MaximumLikelihoodEstimator)

# Show every CPD that was learned, each preceded by its variable name.
print("\n\n............Overview of our CPDs from the fit...........:")
for cpd in model.get_cpds():
    heading = "CPD of {variable}:".format(variable=cpd.variable)
    print(heading, cpd, sep="\n")

# ---------------------------------------------------------------------------
# Using the model to query
# ---------------------------------------------------------------------------
# Exact inference using Variable Elimination.
model_infer = VariableElimination(model)

# Example query (disabled): probability of class given sex.
# print("\n\n............Here are some queries...............")
# q1 = model_infer.query(variables=['class'], evidence={'sex':0})
# print(q1['class'])