def test_build_tree(self):
    """Check that model.build_tree turns an XML document into the expected tree.

    The fixture describes a root "Gender" node with two branches
    ("Female" -> "Bush Approval", "Male" -> "Ideology"), each ending in
    decision leaves.  The test inspects the parsed structure level by level
    and finally compares it against a hand-built Node/Label tree.  Only node
    names, edge values and leaf choices are asserted; the ``num``, ``end``
    and ``p`` attributes of the XML are not checked here.
    """
    # Adjacent literals are concatenated by the parser; every fragment starts
    # with the single space that separates it from the previous element.
    xml_source = (
        ' <Tree name = "test">'
        ' <node var ="Gender">'
        ' <edge var ="Female" num="2">'
        ' <node var = "Bush Approval">'
        ' <edge var = "Approve" num="2" >'
        ' <decision end = "2" choice = "McCain" p = "0.9"/>'
        ' </edge>'
        ' <edge var = "Disapprove" num="1">'
        ' <decision end = "1" choice="Obama" p = "0.95"/>'
        ' </edge>'
        ' </node>'
        ' </edge>'
        ' <edge var = "Male" num="1">'
        ' <node var = "Ideology">'
        ' <edge var = "Liberal" num = "1">'
        ' <decision end = "1" choice ="Obama" p = "0.99"/>'
        ' </edge>'
        ' <edge var = "Moderate" num="2">'
        ' <decision end = "1" choice = "Obama" p = "0.7"/>'
        ' </edge>'
        ' <edge var = "Conservative" num ="3">'
        ' <decision end = "2" choice = "McCain" p = "0.95"/>'
        ' </edge>'
        ' </node>'
        ' </edge>'
        ' </node>'
        ' </Tree> '
    )

    parsed_root = model.build_tree(xml_source)
    self.assertEqual(parsed_root.name, "Gender")
    self.assertEqual(len(parsed_root.edges.keys()), 2)

    # (edge value on the root, name of the child node, (edge value, leaf choice)...)
    expected_branches = (
        ("Female", "Bush Approval", (
            ("Approve", "McCain"),
            ("Disapprove", "Obama"),
        )),
        ("Male", "Ideology", (
            ("Liberal", "Obama"),
            ("Moderate", "Obama"),
            ("Conservative", "McCain"),
        )),
    )
    for gender, child_name, leaves in expected_branches:
        child = parsed_root.edges[gender]
        self.assertEqual(child.name, child_name)
        self.assertEqual(len(child.edges.keys()), len(leaves))
        for answer, choice in leaves:
            self.assertEqual(child.edges[answer], Label(choice))

    # Whole-tree comparison against an explicitly constructed equivalent.
    expected_tree = Node(
        "Gender",
        ("Female", Node("Bush Approval",
                        ("Approve", Label("McCain")),
                        ("Disapprove", Label("Obama")))),
        ("Male", Node("Ideology",
                      ("Liberal", Label("Obama")),
                      ("Moderate", Label("Obama")),
                      ("Conservative", Label("McCain")))),
    )
    self.assertEqual(parsed_root, expected_tree)
def test_stringify_tree(self):
    """Round-trip check: a tree serialized by model.stringify_tree must
    parse back (via model.build_tree) into a tree equal to the original.
    """
    female_subtree = Node(
        "Bush Approval",
        ("Approve", Label("McCain")),
        ("Disapprove", Label("Obama")),
    )
    male_subtree = Node(
        "Ideology",
        ("Liberal", Label("Obama")),
        ("Moderate", Label("Obama")),
        ("Conservative", Label("McCain")),
    )
    original = Node(
        "Gender",
        ("Female", female_subtree),
        ("Male", male_subtree),
    )

    serialized = model.stringify_tree(original)
    rebuilt = model.build_tree(serialized)
    self.assertEqual(original, rebuilt)
def main(to_classify_csv, decision_tree_xml, restrictionstxt, has_label_column):
    """Classify the records of a CSV file with a decision tree and print the results.

    Args:
        to_classify_csv: Readable file-like object; its full contents are
            passed to ``dataset.read``.
        decision_tree_xml: Readable file-like object; its full contents are
            passed to ``model.build_tree``.
        restrictionstxt: Passed unchanged to ``dataset.restrictions_from_text``
            (presumably the restrictions text or a handle to it -- confirm
            against that helper).
        has_label_column: When truthy, the second item of every record is
            treated as its true label and evaluation statistics are printed;
            otherwise each record is printed next to its predicted class.

    Returns:
        None. All output goes to stdout.
    """
    # NOTE: open question from the original author -- how should we determine
    # whether the input has a label column?  One idea: look at the number of
    # unique edge labels in the decision tree to determine the features.
    tree = model.build_tree(decision_tree_xml.read())
    restrictions = dataset.restrictions_from_text(restrictionstxt)
    cols, data = dataset.read(to_classify_csv.read(), has_label_column, restrictions)
    predicted_classes = [tree.classify(record[0], cols) for record in data]

    if not has_label_column:
        # Unlabeled input: just list each record with its prediction.
        for record, predicted in zip(data, predicted_classes):
            print(record[0], predicted)
        return

    # True labels are only extracted when they are known to be present, so
    # unlabeled records are never indexed for a label they may not carry.
    labels = [record[1] for record in data]
    print('Records:', len(data))
    print('Correctly classified:',
          sum(1 for predicted, actual in zip(predicted_classes, labels)
              if predicted == actual))
    print('Incorrectly classified:',
          sum(1 for predicted, actual in zip(predicted_classes, labels)
              if predicted != actual))
    print('Accuracy:', sampling.accuracy(labels, predicted_classes))
    print('Error:', sampling.error_rate(labels, predicted_classes))
    print('Confusion matrix:')
    print(sampling.confusion_matrix(labels, predicted_classes))