def test_random_forest_1(self):
    """Smoke-test random forest generation on the watermelon_2.0 dataset.

    Loads ``./datasets/watermelon.csv``, attaches the dataset to a
    ``RandomForest(10)`` and calls ``generate_random_forest()``. There are
    no assertions: the test passes as long as nothing raises.
    """
    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)

    # 10 is passed positionally; presumably the number of trees
    # (confirm against the RandomForest constructor).
    forest = RandomForest(10)
    forest.set_dataset(dataset)
    forest.generate_random_forest()
def test_select_best_attribute_to_split(self):
    """Check which attribute a TreeNode picks as the best split.

    The node receives every sample of the watermelon_2.0 dataset and the
    attribute list ``[0, 1, 2, 3, 4, 5]``; the selected attribute is
    expected to be ``3``.
    """
    node = TreeNode()
    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)

    node.set_samples(dataset.samples)
    node.set_attribute_list(list(range(6)))

    self.assertEqual(node.select_best_attribute_to_split(), 3)
def test_get_accuracy(self):
    """Smoke-test ``RandomForest.get_accuracy`` on the watermelon_3.0 data.

    A ``RandomForest(10)`` is generated from ``./datasets/watermelon2.csv``
    and ``get_accuracy`` is called on the same samples. The returned value
    is not inspected; the test only verifies that the call completes.
    """
    dataset = Dataset(
        _dataset_name='watermelon_3.0',
        _dataset_file_path='./datasets/watermelon2.csv',
    )
    dataset.load_dataset(verbose=False)

    forest = RandomForest(10)
    forest.set_dataset(dataset)
    forest.generate_random_forest()

    # The result is deliberately discarded, exactly as before.
    forest.get_accuracy(dataset.samples)
def test_predict_batch(self):
    """Smoke-test ``RandomForest.predict_batch`` on the watermelon_3.0 data.

    A ``RandomForest(10)`` is generated from ``./datasets/watermelon2.csv``
    and ``predict_batch`` is called with the dataset's own samples. The
    predictions are not checked; the test passes if nothing raises.
    """
    dataset = Dataset(
        _dataset_name='watermelon_3.0',
        _dataset_file_path='./datasets/watermelon2.csv',
    )
    dataset.load_dataset(verbose=False)

    forest = RandomForest(10)
    forest.set_dataset(dataset)
    forest.generate_random_forest()

    # Predictions are computed but not asserted on.
    forest.predict_batch(dataset.samples)
def test_split_by_attribute_internal(self):
    """Check the partition sizes produced by splitting on attribute 3.

    Splitting the watermelon_2.0 samples on attribute ``3`` must yield a
    mapping whose entries ``0``, ``1`` and ``2`` hold 9, 5 and 3 rows
    respectively.
    """
    node = TreeNode()
    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)

    node.set_samples(dataset.samples)
    node.set_attribute_list(list(range(6)))
    samples_by_value = node.split_by_attribute_internal(3)

    # (attribute value, expected number of rows), checked in this order.
    expected_row_counts = ((0, 9), (1, 5), (2, 3))
    for attribute_value, row_count in expected_row_counts:
        self.assertEqual(
            samples_by_value[attribute_value].shape[0], row_count)
def test_generate_decision_tree_continuous(self):
    """Smoke-test decision tree generation on the watermelon_3.0 dataset.

    Builds a ``DecisionTree`` from ``./datasets/watermelon2.csv`` and calls
    ``generate_decision_tree``. The resulting root is not inspected; the
    test passes if nothing raises.
    """
    dataset = Dataset(
        _dataset_name='watermelon_3.0',
        _dataset_file_path='./datasets/watermelon2.csv',
    )
    dataset.load_dataset(verbose=False)

    decision_tree = DecisionTree()
    decision_tree.set_training_samples_root(dataset.samples)

    # NOTE(review): set_attributes_list is called twice in a row. The
    # first call looks redundant (presumably overwritten by the second),
    # but both are kept because the setter's side effects are not visible
    # from here - confirm before removing.
    decision_tree.set_attributes_list(list(range(dataset.num_features)))
    decision_tree.set_attributes_list(dataset.feature_category_list)

    decision_tree.generate_decision_tree(
        decision_tree.training_samples_root,
        decision_tree.attributes_list,
    )
def test_generate_decision_tree(self):
    """Check the shape of the tree generated from watermelon_2.0.

    After generating the tree, the node reached through child ``0`` of the
    root and then child ``1`` of that node must have exactly 3 children.
    """
    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)

    decision_tree = DecisionTree()
    decision_tree.set_training_samples_root(dataset.samples)
    decision_tree.set_attributes_list(dataset.feature_category_list)

    root = decision_tree.generate_decision_tree(
        decision_tree.training_samples_root,
        decision_tree.attributes_list,
    )

    # Walk root -> child 0 -> child 1 and count that node's children.
    first_level_node = root.child_node_list[0]
    second_level_node = first_level_node.child_node_list[1]
    self.assertEqual(len(second_level_node.child_node_list), 3)
def test_vis_tree_1(self):
    """Smoke-test ``VisTree`` on a tree generated from watermelon_2.0.

    Generates a decision tree, stores its root on the ``DecisionTree``
    object and passes it to ``VisTree`` (named ``"test_decision_tree"``),
    then calls ``vis_tree(mode=1)``. Nothing is asserted.
    """
    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)

    decision_tree = DecisionTree()
    decision_tree.set_training_samples_root(dataset.samples)
    decision_tree.set_attributes_list(dataset.feature_category_list)
    root = decision_tree.generate_decision_tree(
        decision_tree.training_samples_root,
        decision_tree.attributes_list,
    )
    decision_tree.set_root(root)

    visualizer = VisTree(
        decision_tree,
        dataset.feature2number_mapping,
        dataset.feature_name_list,
        _tree_name="test_decision_tree",
    )
    # NOTE(review): the meaning of mode=1 and any output it writes are not
    # visible from this test - see VisTree.vis_tree.
    visualizer.vis_tree(mode=1)
def test_generate_random_decision_tree_2(self):
    """Smoke-test seeded tree generation on the watermelon_3.0 dataset.

    ``generate_decision_tree`` is invoked twice on the same samples and
    attribute list, first with ``random_state=1`` and then with
    ``random_state=2``. The generated roots are not compared or asserted.
    """
    dataset = Dataset(
        _dataset_name='watermelon_3.0',
        _dataset_file_path='./datasets/watermelon2.csv',
    )
    dataset.load_dataset(verbose=False)

    decision_tree = DecisionTree()
    decision_tree.set_training_samples_root(dataset.samples)
    decision_tree.set_attributes_list(dataset.feature_category_list)

    # Same inputs, two different seeds, in this order.
    for seed in (1, 2):
        decision_tree.generate_decision_tree(
            decision_tree.training_samples_root,
            decision_tree.attributes_list,
            random_state=seed,
        )
def test_get_all_possible_values_on_attribute(self):
    """Check the value sets reported for three watermelon_2.0 attributes.

    ``get_all_possible_values_on_attribute`` is queried with the tuples
    ``(0, 0)``, ``(1, 0)`` and ``(5, 0)`` and must return ``[0, 1, 2]``,
    ``[0, 1, 2]`` and ``[0, 1]`` respectively.
    """
    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)

    decision_tree = DecisionTree()
    decision_tree.set_training_samples_root(dataset.samples)
    decision_tree.set_attributes_list(dataset.feature_category_list)

    # (attribute argument, expected values), checked in this order.
    cases = (
        ((0, 0), [0, 1, 2]),
        ((1, 0), [0, 1, 2]),
        ((5, 0), [0, 1]),
    )
    for attribute, expected_values in cases:
        self.assertEqual(
            decision_tree.get_all_possible_values_on_attribute(attribute),
            expected_values,
        )
def test_get_ent(self):
    """Check ``TreeNode.get_ent`` on synthetic arrays and watermelon_2.0.

    Expected values:
      * an all-ones 4x5 array      -> 0
      * an all-zeros 4x5 array     -> 0
      * ones with column 4 zeroed in the first two rows -> 1
      * the watermelon_2.0 samples -> approximately 0.9975025463691152

    The third case suggests the last column is treated as the label
    (presumably; confirm against ``get_ent``).
    """
    node = TreeNode()

    self.assertEqual(node.get_ent(np.ones((4, 5))), 0)
    self.assertEqual(node.get_ent(np.zeros((4, 5))), 0)

    # Two of the four rows get a 0 in the last column, the rest stay 1.
    half_and_half = np.ones((4, 5))
    half_and_half[:2, 4] = 0
    self.assertEqual(node.get_ent(half_and_half), 1)

    dataset = Dataset(
        _dataset_name='watermelon_2.0',
        _dataset_file_path='./datasets/watermelon.csv',
    )
    dataset.load_dataset(verbose=False)
    self.assertAlmostEqual(
        node.get_ent(dataset.samples), 0.9975025463691152)
def test_random_forest_3(self):
    """Time forest generation on uci_blood and print the out-of-bag error.

    Builds a ``RandomForest`` with 20 estimators and ``n_samples=400`` from
    ``./datasets/uci_blood.csv``, prints the average wall-clock seconds
    spent per estimator in ``generate_random_forest()``, and then prints
    the value returned by ``calculate_out_of_bag_error()``. Nothing is
    asserted.
    """
    import time

    # Single source of truth for the estimator count: it is used both to
    # build the forest and to average the timing below.
    n_estimators = 20

    dataset = Dataset(
        _dataset_name='uci_blood',
        _dataset_file_path='./datasets/uci_blood.csv',
    )
    dataset.load_dataset(verbose=False)

    forest = RandomForest(n_estimators=n_estimators, n_samples=400)
    forest.set_dataset(dataset)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    start = time.perf_counter()
    forest.generate_random_forest()
    elapsed = time.perf_counter() - start
    print(elapsed / float(n_estimators))

    # Disabled: render every tree of the forest for manual inspection.
    # i = 0
    # for tree in forest.forest:
    #     test_vis_tree = VisTree(
    #         tree, dataset.feature2number_mapping,
    #         dataset.feature_name_list,
    #         _tree_name="test_3_random_decision_tree_%d" % (i))
    #     test_vis_tree.vis_tree(mode=1)
    #     i += 1

    print(forest.calculate_out_of_bag_error())
def test_decision_tree_predict(self):
    """Check that the tree reproduces every watermelon_3.0 training label.

    A decision tree is generated from ``./datasets/watermelon2.csv`` and
    installed as the root. For each sample row, all columns but the last
    are passed to ``predict`` and the result must equal the last column.
    """
    dataset = Dataset(
        _dataset_name='watermelon_3.0',
        _dataset_file_path='./datasets/watermelon2.csv',
    )
    dataset.load_dataset(verbose=False)

    decision_tree = DecisionTree()
    decision_tree.set_training_samples_root(dataset.samples)
    decision_tree.set_attributes_list(dataset.feature_category_list)
    root = decision_tree.generate_decision_tree(
        decision_tree.training_samples_root,
        decision_tree.attributes_list,
    )
    decision_tree.set_root(root)

    for row_index in range(dataset.num_samples):
        row = dataset.samples[row_index, :]
        # Last column is compared against the prediction; the rest is input.
        features, expected_label = row[0:-1], row[-1]
        predicted_label = decision_tree.predict(test_sample=features)
        self.assertEqual(predicted_label, expected_label)
def test_random_forest_predict(self):
    """Smoke-test ``RandomForest.predict`` on every watermelon_3.0 sample.

    A ``RandomForest(10)`` is generated from ``./datasets/watermelon2.csv``
    and ``predict`` is called once per sample row with all columns but the
    last. Predictions are not compared with the labels; the test passes if
    nothing raises.
    """
    dataset = Dataset(
        _dataset_name='watermelon_3.0',
        _dataset_file_path='./datasets/watermelon2.csv',
    )
    dataset.load_dataset(verbose=False)

    forest = RandomForest(10)
    forest.set_dataset(dataset)
    forest.generate_random_forest()

    # Disabled: render every tree of the forest for manual inspection.
    # i = 0
    # for tree in forest.forest:
    #     test_vis_tree = VisTree(
    #         tree, dataset.feature2number_mapping,
    #         dataset.feature_name_list,
    #         _tree_name="test_2_random_decision_tree_%d" % (i))
    #     test_vis_tree.vis_tree(mode=1)
    #     i += 1

    for row_index in range(dataset.num_samples):
        row = dataset.samples[row_index, :]
        # The label (last column) is read but never checked here.
        features, label = row[0:-1], row[-1]
        forest.predict(test_sample=features)
import graphviz

# Disabled: export a fitted tree through Graphviz.
# dot_data = tree.export_graphviz(clf, out_file="./random_forests_sklearn/vis/iris")
# graph = graphviz.Source(dot_data)
# graphviz.render(engine="dot",format="pdf",filepath="./random_forests_sklearn/vis/iris")
from random_forests.utils import Dataset

# Disabled: the same experiment on the watermelon_2.0 dataset.
# test_dataset = Dataset(_dataset_name = 'watermelon_2.0', _dataset_file_path = './datasets/watermelon.csv')
# test_dataset.load_dataset(verbose=False)
# X = test_dataset.samples[:,:-1].astype(int)
# y = test_dataset.labels.astype(int)
# clf = tree.DecisionTreeClassifier(criterion="entropy")
# clf = clf.fit(X,y)

# Load uci_blood: every column but the last is a feature, and the labels
# come from the dataset's `labels` attribute; both are cast to int.
test_dataset = Dataset(
    _dataset_name='uci_blood',
    _dataset_file_path='./datasets/uci_blood.csv',
)
test_dataset.load_dataset(verbose=False)
X = test_dataset.samples[:, :-1].astype(int)
y = test_dataset.labels.astype(int)

# Three classifiers to compare: an entropy-based decision tree, a
# uniform-strategy dummy classifier and a 200-tree random forest. The tree
# and the forest are fitted on the full data (in that order); the dummy
# classifier is left unfitted.
clf = tree.DecisionTreeClassifier(criterion="entropy").fit(X, y)
stupid_clf = DummyClassifier(strategy='uniform')
RF_clf = RandomForestClassifier(n_estimators=200).fit(X, y)

# 5-fold cross-validated accuracy for each classifier.
aver_list = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
aver_list2 = cross_val_score(stupid_clf, X, y, cv=5, scoring='accuracy')
aver_list3 = cross_val_score(RF_clf, X, y, cv=5, scoring='accuracy')

print(aver_list)
print(aver_list2)
# NOTE(review): aver_list3 (the random forest scores) is computed but not
# printed in the visible code - confirm whether a print is missing.
def main():
    """Load the watermelon_2.0 dataset and instantiate a ``DecisionTree``.

    NOTE(review): the visible body ends right after the tree is
    constructed and neither local is used afterwards; the function may
    continue beyond this excerpt - confirm against the full file.
    """
    test_dataset = Dataset(_dataset_name='watermelon_2.0', _dataset_file_path='./datasets/watermelon.csv')
    # load_dataset() runs with its default arguments here.
    test_dataset.load_dataset()
    myDecisionTree = DecisionTree()