def testClassifierFromDf(self):
  """Fit a BNClassifier from pandas data, from a csv path, and check that a
  missing prior leads to a DatabaseError."""
  csvfile = self.agrumSrcDir('miniasia.csv')
  df_asia = pd.read_csv(csvfile)
  asia_target_column = 'lung_cancer'

  x_train_asia = df_asia[:9000].drop(asia_target_column, axis=1)
  y_train_asia = df_asia[:9000][asia_target_column]
  x_test_asia = df_asia[-1000:].drop(asia_target_column, axis=1)
  y_test_asia = df_asia[-1000:][asia_target_column]

  classif2 = skbn.BNClassifier()
  classif2.fit(x_train_asia, y_train_asia)
  self.assertEqual(classif2.bn.size(), 8)
  self.assertEqual(classif2.target, asia_target_column)
  # the decision threshold is a probability, hence bounded by 1;
  # assertLessEqual gives a clearer failure message than assertTrue(x <= 1)
  self.assertLessEqual(classif2.threshold, 1)

  yproba = classif2.predict_proba(x_test_asia)
  self.assertEqual(yproba.shape, (299, 2))
  # class probabilities must sum to 1 — float comparison, so almost-equal
  self.assertAlmostEqual(yproba[0].sum(), 1)

  ypred = classif2.predict(x_test_asia)
  self.assertEqual(ypred.shape, (299,))
  self.assertIn(ypred[0], [0, 1])
  self.assertGreater(classif2.MarkovBlanket.size(), 0)

  # same model, fitted directly from the csv filename
  classif3 = skbn.BNClassifier()
  classif3.fit(data=csvfile, targetName=asia_target_column)
  self.assertEqual(classif3.bn.size(), 8)
  self.assertEqual(classif3.target, asia_target_column)
  self.assertLessEqual(classif3.threshold, 1)

  # same model, fitted from a freshly re-read DataFrame
  df = pd.read_csv(csvfile)
  classif4 = skbn.BNClassifier()
  classif4.fit(data=df, targetName=asia_target_column)
  self.assertEqual(classif4.bn.size(), 8)
  self.assertEqual(classif4.target, asia_target_column)
  self.assertLessEqual(classif4.threshold, 1)

  # some instantiation of parents are missing : No prior should lead to division by 0
  classif_noprior = skbn.BNClassifier(aPriori="NoPrior")
  with self.assertRaises(gum.DatabaseError):
    classif_noprior.fit(x_train_asia, y_train_asia)
def test_with_discretization(self):
  """Check preparedData() labels when continuous columns are discretized.

  The classifier is fitted once; preparedData is then applied both to the
  training data and to new, unseen data.
  """
  X = pd.DataFrame([
    [1, 1.5, "A", True],
    [2, 2.6, "B", False],
    [3, 3.14, "B", True],
    [1, 0.5, "A", False],
    [1, 0.15, "A", True],
  ])
  y = [3, 2, 3, 1, 2]
  classifier = skbn.BNClassifier(discretizationThreshold=3, discretizationNbBins=3)
  classifier.fit(X, y)

  res = classifier.preparedData(X, y)
  # assertEquals is deprecated and removed in Python 3.12 — use assertEqual
  self.assertEqual(res["X1"][1], "[2.23333;3.14)")
  self.assertEqual(str(res["X3"][3]), "False")

  # new data, including values outside the training range (0 and 203)
  X = pd.DataFrame([
    [1, 0, "A", True],
    [1, 4, "B", False],
    [2, 3.11, "B", True],
    [2, 0.5, "A", False],
    [3, 0.15, "A", True],
    [3, 203, "A", True],
  ])
  y = [3, 2, 3, 1, 2, 1]
  res = classifier.preparedData(X, y)
  self.assertEqual(res["X1"][0], "(0.15;0.833333[")
  self.assertEqual(str(res["X3"][2]), "True")
def test_with_nparray(self):
  """Check preparedData() on a numpy array input (iris, first two features)."""
  iris = datasets.load_iris()
  X = iris.data[:, 0:2]  # we only take the first two features for visualization
  y = iris.target

  classifier = skbn.BNClassifier(discretizationThreshold=3, discretizationNbBins=3)
  classifier.fit(X, y)

  res = classifier.preparedData(X, y)
  # X0 and X1 are discretized so the labels should start with '[' but the
  # rest is random (chosen by load_iris)...
  # assertEquals is deprecated and removed in Python 3.12 — use assertEqual
  self.assertEqual(res["x0"][149][0], "[")
  self.assertEqual(res["x1"][149][0], "[")
def testFitFromCsv(self):
  """Fit a BNClassifier straight from a csv file and sanity-check the model."""
  target = 'lung_cancer'

  classifier = skbn.BNClassifier()
  classifier.fit(data=self.agrumSrcDir('miniasia.csv'), targetName=target)

  # the learned network must contain the 8 asia variables
  self.assertEqual(classifier.bn.size(), 8)
  self.assertEqual(classifier.target, target)
  # the decision threshold is a probability
  self.assertTrue(classifier.threshold <= 1)
  # the target's Markov blanket must not be empty
  self.assertGreater(classifier.MarkovBlanket.size(), 0)
def test_no_discretization(self):
  """Check preparedData() without discretization, and that unseen values
  raise gum.OutOfBounds."""
  X = pd.DataFrame([
    [1, 1.5, "A", True],
    [2, 2.6, "B", False],
    [3, 3.14, "B", True],
    [1, 0.5, "A", False],
    [1, 0.15, "A", True],
  ])
  y = [3, 2, 3, 1, 2]
  classifier = skbn.BNClassifier()
  classifier.fit(X, y)

  res = classifier.preparedData(X, y)
  # assertEquals is deprecated and removed in Python 3.12 — use assertEqual
  self.assertEqual(str(res["X1"][1]), "2.6")
  self.assertEqual(str(res["X3"][3]), "False")

  # without discretization, a value never seen during fit (X1 == 0) is
  # outside the learned domain and must raise
  X = pd.DataFrame([[1, 0, "A", True]])
  y = [3]
  with self.assertRaises(gum.OutOfBounds):
    res = classifier.preparedData(X, y)
def test_with_file(self):
  """Check preparedData() when both fit and preparation read from a csv file."""
  classifier = skbn.BNClassifier()
  classifier.fit(data=self.agrumSrcDir("miniasia.csv"), targetName="dyspnoea")

  res = classifier.preparedData(data=self.agrumSrcDir("miniasia.csv"))
  # assertEquals is deprecated and removed in Python 3.12 — use assertEqual
  self.assertEqual(str(res["lung_cancer"][0]), "0")
def _computepoints(bn, csv_name, target, label, show_progress=True, with_labels=True, significant_digits=10):
  """
  Compute the ROC curve points.

  Parameters
  ----------
  bn : pyAgrum.BayesNet
    a Bayesian network
  csv_name : str
    a csv filename
  target : str
    the target variable's name
  label : str
    the label of `target` considered as the positive class
  show_progress : bool
    whether a progress bar is displayed while the points are computed
  with_labels : bool
    whether the csv contains the target's labels; when False, `label` is
    resolved to its index in the target variable's domain
  significant_digits : int
    number of significant digits when computing probabilities

  Returns
  -------
  tuple
    (res, totalP, totalN) where res is a list of (proba, isWellClassified)
    for each line of csv_name, totalP the number of positives and totalN
    the number of negatives.

  Raises
  ------
  ValueError
    if `label` is not a label of the target variable (only checked when
    with_labels is False).
  """
  idTarget = bn.idFromName(target)
  label = str(label)

  if not with_labels:
    # resolve the textual label to its index in the target's domain
    idLabel = -1
    for i in range(bn.variable(idTarget).domainSize()):
      if bn.variable(idTarget).label(i) == label:
        idLabel = i
        break
    if idLabel < 0:
      # explicit raise instead of `assert`: asserts are stripped under -O
      raise ValueError(f"Label '{label}' not found in target '{target}'")
  else:
    idLabel = label

  Classifier = skbn.BNClassifier(significant_digit=significant_digits)

  pbar = None
  if show_progress:
    # tqdm is optional:
    # pylint: disable=import-outside-toplevel
    from tqdm import tqdm
    pbar = tqdm(total=_lines_count(csv_name) - 1, desc=csv_name,
                bar_format='{desc}: {percentage:3.0f}%|{bar}|')

  Classifier.fromTrainedModel(bn, target, idLabel)

  # as a Binary classifier, y will be a list of True (good classification)
  # and False (bad one)
  X, y = Classifier.XYfromCSV(csv_name, with_labels=with_labels, target=target)
  predictions = Classifier.predict_proba(X)

  totalP = np.count_nonzero(y)
  totalN = len(y) - totalP

  res = []
  for i in range(len(X)):
    # probability of the positive class for row i
    px = predictions[i][1]
    res.append((px, y[i]))
    if pbar is not None:
      pbar.update()

  if pbar is not None:
    pbar.close()

  return res, totalP, totalN