import unittest

class TestNode(unittest.TestCase):
    def setUp(self):
        self.df = creatData()
        self.node = Node(self.df)

    def test_entropy(self):
        h = self.node._entropy(self.df)
        self.assertTrue(h < 0.98 and h > 0.97)  # h = 0.971

    def test_conditionalEntropy(self):
        h_age = self.node.conditionalEntropy('age')
        self.assertTrue(h_age > 0.083 and h_age < 0.084)  # h_age = 0.083
        h_work = self.node.conditionalEntropy('work')
        self.assertTrue(h_work > 0.32 and h_work < 0.33)  # h_work = 0.324
        h_house = self.node.conditionalEntropy('house')  # h = 0.419
        self.assertTrue(h_house > 0.41 and h_house < 0.42)
        h_credit = self.node.conditionalEntropy('credit')  # h = 0.363
        self.assertTrue(h_credit > 0.362 and h_credit < 0.364)

    def test_findBestFeature(self):
        feature = self.node.findBestFeature()
        self.assertTrue(feature == 'house')

    def test_getDadaFrame(self):
        df = self.node.getDadaFrame('age', 1)
        df2 = self.node.getDadaFrame('age', 2)
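# A minimal sketch of the entropy / information-gain math these tests exercise.
# The expected values (0.971, 0.083, 0.324, ~0.42, 0.363) appear to match the
# classic loan-approval toy dataset, which suggests conditionalEntropy here
# actually returns the information gain g(D, A). The function names below are
# illustrative assumptions, not the tested Node API.
import numpy as np
import pandas as pd

def entropy(labels):
    # H(D) = -sum(p_k * log2(p_k)) over the class proportions p_k
    p = labels.value_counts(normalize=True)
    return float(-(p * np.log2(p)).sum())

def information_gain(df, feature, target='label'):
    # g(D, A) = H(D) - H(D|A), weighting each subset's entropy by its size
    h_d = entropy(df[target])
    h_cond = sum(len(sub) / len(df) * entropy(sub[target])
                 for _, sub in df.groupby(feature))
    return h_d - h_cond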
import numpy as np

def get_random_split(data, is_categorical, n_features):
    # go over a random subset of the attributes
    classes = list(set(row[-1] for row in data))
    ind, split, best_gini, best_groups = 0, 0, 1, []
    num_features = len(data[0]) - 1
    sub = np.random.choice(range(num_features), n_features, replace=False)
    for att in sub:
        att_vals = list(set(row[att] for row in data))
        mini = min(att_vals)
        maxi = max(att_vals)
        if mini == maxi:
            # constant attribute: nothing to split on
            continue
        if att_vals == [0, 1]:
            # binary attribute: only two candidate split points
            for i in [0, 1]:
                groups = binary_split(att, i, data)
                gini = gini_index(groups, classes)
                if gini < best_gini:
                    ind, split, best_gini, best_groups = att, i, gini, groups
            continue
        # continuous attribute: scan 100 evenly spaced candidate thresholds
        for i in np.arange(mini, maxi, (maxi - mini) / 100):
            if att in is_categorical:
                groups = categorical_split(att)
            else:
                groups = binary_split(att, i, data)
            gini = gini_index(groups, classes)
            if gini < best_gini:
                ind, split, best_gini, best_groups = att, i, gini, groups
    return Node(ind, split, best_groups)  # might just need attribute and splitting value
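# Hedged sketch of the two helpers get_random_split relies on. binary_split and
# gini_index are not shown in this snippet, so these bodies are assumptions
# based on the standard CART definitions, not the original implementations.
def binary_split(att, value, data):
    # Partition rows into (left, right) on data[att] < value.
    left = [row for row in data if row[att] < value]
    right = [row for row in data if row[att] >= value]
    return left, right

def gini_index(groups, classes):
    # Weighted Gini impurity: sum over groups of (1 - sum(p_k^2)) * |group| / n.
    n = sum(len(g) for g in groups)
    gini = 0.0
    for g in groups:
        if not g:
            continue
        score = sum((sum(1 for row in g if row[-1] == c) / len(g)) ** 2
                    for c in classes)
        gini += (1.0 - score) * len(g) / n
    return gini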
def test_find_best_split(self):
    tree = DecisionTreeRegressor()
    tree.target_class = 'V1'
    node = Node(data=self.data)
    split = tree.find_best_split(node)
    print(split)
    self.assertTupleEqual(('in', 1, ['5', '2', '3']), split[0])
    self.assertAlmostEqual(32.166666666, split[1])
def test_find_next_partition(self):
    tree = DecisionTreeRegressor(max_leaf_nodes=3)
    tree.target_class = 'V1'
    # (('in', 1, ['5', '2', '3']), 32.16666666666667)
    file = open("data/bank-marketing.arff")
    data = Data(file)
    node = Node(data=data)
    node.is_leaf = True
    tree.root = node
    change, next_node = tree.find_best_node_to_split(node)
    print(change, next_node)
    tree.partition(next_node)
    print(tree.n_leaves)
    tree.root.left_child.data.summary()
    tree.root.right_child.data.summary()
    tree.root.right_child.left_child.data.summary()
    tree.root.right_child.right_child.data.summary()
def build_tree(self, data, depth):
    # check all the classes
    targets = list(row[-1] for row in data)
    if len(set(targets)) == 1 or depth >= self.max_depth or len(targets) == 0:
        # pure leaf, depth limit reached, or empty partition
        print(depth)
        # guard: an empty partition has no majority label to return
        clas = max(set(targets), key=targets.count) if targets else None
        return Node(0, 0, [], clas)
    root = get_random_split(data, [], self.n_features)
    for children in root.training_groups:
        # recurse into the data assigned to each child of this node
        child_node = self.build_tree(children, depth + 1)
        root.add_child(child_node)
    return root
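# Hedged companion sketch: classifying a row by descending a tree returned by
# build_tree. It assumes Node exposes .attribute (split feature index), .value
# (split threshold), .children (left/right subtrees) and .clas (leaf label);
# those names are inferred from the constructor calls above, not confirmed.
def predict_row(node, row):
    if not node.children:
        # leaf: return the stored majority class
        return node.clas
    if row[node.attribute] < node.value:
        return predict_row(node.children[0], row)
    return predict_row(node.children[1], row)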
def test_train_node(self):
    features = [
        {1: 1, 2: 1, 3: 1},
        {1: 1, 2: 1, 3: 2},
        {1: 2, 2: 1, 3: 1},
        {1: 3, 2: 2, 3: 1},
        {1: 3, 2: 3, 3: 1},
        {1: 3, 2: 3, 3: 2},
    ]
    labels = [0, 0, 1, 1, 1, 0]
    ds = Dataset(features, labels)
    root = Node(ds)
    tree = DecisionTree(max_depth=100)
    root.train([1, 2, 3], 1, 100)
    print(tree)
def test_tree_gen(state, aicolor, depthLim, heuristic):
    def tree_gen(node):
        if node.depth < node.depthLim:
            node.genNextMoves()
            if len(node.nextMoves) == 0:
                endStateCheck(node, None)
            else:
                for n in node.nextMoves:
                    tree_gen(n)

    # pytest.set_trace()
    tempNode = Node(aicolor, state, 0, depthLim, heuristic)
    tree_gen(tempNode)
    assert 1
# Array math
import numpy as np
# Data wrangling
import pandas as pd
# Reference implementation and tree printing from scikit-learn
from sklearn.tree import DecisionTreeClassifier, export_text

# Reading the data
d = pd.read_csv("data/classification/train.csv")[['Age', 'Fare', 'Survived']].dropna()

# Constructing the X and Y matrices
X = d[['Age', 'Fare']]
Y = d['Survived'].values.tolist()

# Constructing the parameter dict
hp = {'max_depth': 4, 'min_samples_split': 50}

# Initiating the Node
root = Node(Y, X, **hp)

# Getting the best split
root.grow_tree()

# Using the ML package
clf = DecisionTreeClassifier(**hp)
clf.fit(X, Y)

# Printing out the trees
root.print_tree()
print(export_text(clf, feature_names=['Age', 'Fare']))

# Predictions
X['scikit_learn'] = clf.predict(X[['Age', 'Fare']])
X['custom_yhat'] = root.predict(X[['Age', 'Fare']])
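# Quick agreement check between the custom tree and scikit-learn (an added
# sketch, not part of the original script): with identical hyperparameters the
# two trees should label most rows the same way.
agreement = (X['scikit_learn'] == X['custom_yhat']).mean()
print(f"prediction agreement: {agreement:.2%}")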
import copy
import random

def fit(self, features, attributes, prev_value, label_set, current_depth,
        max_depth, rand_attribute_size=None):
    """Train an ID3 decision tree.

    :param features: ordered features from dataset
    :type features: python list containing Feature objects
    :param attributes: attributes for current fit iteration
    :type attributes: python tuple containing Attribute objects
    :param prev_value: attribute value of previous adjacent node
    :type prev_value: integer or None
    :param label_set: ordered labels from dataset
    :type label_set: python tuple containing possible integer labels
    :param current_depth: current tree depth
    :type current_depth: integer
    :param max_depth: maximum desired tree depth
    :type max_depth: integer or float
    :param rand_attribute_size: size of desired random attribute subset if not None
    :type rand_attribute_size: integer or None
    :return: root node of decision tree
    :rtype: Node.Node
    """
    if current_depth > self.max_height:
        self.max_height = current_depth
    if current_depth == max_depth:
        label = get_most_common_label(features)
        return Node.Node(None, prev_value, label)
    # If every example carries the same label, return a leaf with that label
    same_label = True
    base_label = features[0].get_label()
    for example in features:
        if example.get_label() != base_label:
            same_label = False
            break
    if same_label:
        return Node.Node(None, prev_value, base_label)
    if len(attributes) == 0:
        label = get_most_common_label(features)
        return Node.Node(None, prev_value, label)
    if rand_attribute_size is not None:
        indices = random.sample(range(0, len(attributes)),
                                min(rand_attribute_size, len(attributes)))
        random_attributes = [attributes[index] for index in indices]
        attribute_to_split_on = Metrics.get_splitting_attribute(
            features, random_attributes, label_set, self.metric)
    else:
        attribute_to_split_on = Metrics.get_splitting_attribute(
            features, attributes, label_set, self.metric)
    # Make root node
    node = Node.Node(attribute_to_split_on, prev_value, None)
    # Construct S_v for each value of the chosen attribute
    for attribute_value in attribute_to_split_on.values:
        examples_less_split_attribute = []
        for example in features:
            if example.get_attribute_value(attribute_to_split_on) == attribute_value:
                examples_less_split_attribute.append(example)
        # If S_v is empty, add leaf node containing most common label of S
        if len(examples_less_split_attribute) == 0:
            node.add_child(Node.Node(None, attribute_value,
                                     get_most_common_label(features)))
        else:
            less_attributes = list(copy.deepcopy(attributes))
            less_attributes.remove(attribute_to_split_on)
            node.add_child(self.fit(examples_less_split_attribute, less_attributes,
                                    attribute_value, label_set, current_depth + 1,
                                    max_depth, rand_attribute_size))
    # Top-level call: store the finished tree on self.root instead of returning it
    if prev_value is None:
        self.root = node
    else:
        return node
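# A hedged sketch of what Metrics.get_splitting_attribute presumably computes
# for an entropy metric: choose the attribute with maximum information gain.
# The Feature/Attribute accessors mirror the ones fit calls above; everything
# else is an assumption, not the original Metrics code.
import math

def information_gain(features, attribute, label_set):
    def entropy(examples):
        n = len(examples)
        if n == 0:
            return 0.0
        h = 0.0
        for label in label_set:
            p = sum(1 for e in examples if e.get_label() == label) / n
            if p > 0:
                h -= p * math.log2(p)
        return h

    # Gain = H(S) - sum over values v of |S_v|/|S| * H(S_v)
    total = entropy(features)
    for value in attribute.values:
        subset = [e for e in features
                  if e.get_attribute_value(attribute) == value]
        total -= len(subset) / len(features) * entropy(subset)
    return total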
import numpy as np
from sklearn.metrics import accuracy_score


class Tree:
    """The structure of the tree"""

    def __init__(self, criterion="entropy", max_depth=None, lookahead=False,
                 random_feat=False, PFSRT=False, omega=1.9, theta=0.9):
        if random_feat and lookahead:
            raise Exception("random and lookahead cannot coexist in the same tree")
        if PFSRT and lookahead:
            raise Exception("PFSRT and lookahead cannot coexist in the same tree")
        self.criterion = criterion
        self.max_depth = max_depth
        self.lookahead = lookahead
        self.X_data = None
        self.y_data = None
        self.root_node = None
        self.random = random_feat
        self.nr_features = 0
        self._nr_examples = 0
        # PFSRT variables
        self.is_PFSRT = PFSRT
        self._best_accuracy = 0
        self._cur_accuracy = 0
        self.omega = omega  # reward
        self.theta = theta  # punish
        # Probabilistic Feature Selection Random Tree (PFSRT)
        # DS = Depth Score
        # PS = Prior Score
        self.DS = None
        self.PS = None

    def load(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        self.nr_features = X_data.shape[1]
        self._nr_examples = X_data.shape[0]
        # updated data means resetting PFSRT
        self._best_accuracy = 0
        self._cur_accuracy = 0
        if self.is_PFSRT:
            self.DS = np.ones((self.nr_features, self.max_depth))
            self.PS = np.ones((self.nr_features, self.nr_features + 1))

    def fit(self, X_data, y_data):
        self.train(X_data, y_data)

    def train(self, X_data=None, y_data=None):
        if X_data is None and y_data is None:
            if self.X_data is None:
                raise Exception("No data loaded")
        else:
            self.load(X_data, y_data)
        # init root node and start training
        self.root_node = Node(random_feat=self.random, tree=self)
        if self.lookahead:
            self.root_node.train_lookahead(self.X_data, self.y_data, self.max_depth)
        else:
            self.root_node.train(self.X_data, self.y_data, self.max_depth)

    def updatePFSRT(self):
        if not self.is_PFSRT:
            raise Exception("Must enable PFSRT=True")
        # test over the training data and update DS and PS
        y_pred = self.predict(self.X_data)
        self._cur_accuracy = accuracy_score(y_pred, self.y_data)
        # update DS and PS
        self.root_node.recursiveUpdatePFSRT()
        if self._cur_accuracy > self._best_accuracy:
            self._best_accuracy = self._cur_accuracy

    def predict(self, X_data):
        result = []
        for i in X_data:
            result.append(self.root_node.predictData(i))
        return np.array(result)

    def isBinaryClassifier(self):
        return len(np.unique(self.y_data)) == 2

    def getClassProb(self, X_data):
        if not self.isBinaryClassifier():
            raise Exception("classification must be binary for getClassProb")
        result = []
        for i in X_data:
            result.append(self.root_node.getPositiveProb(i))
        return result

    def printTree(self):
        self.root_node.printNode()
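# A minimal usage sketch of the Tree API above, assuming a trained companion
# Node implementation and numeric numpy inputs; the toy data is illustrative.
X = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])
y = np.array([1, 0, 1, 0])

tree = Tree(max_depth=3)
tree.fit(X, y)
print(tree.predict(X))  # predicted labels for the training rows
tree.printTree()        # dump the learned structure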