def test_mean_squared_error(self):
    """Check ``mean_squared_error`` on ``self.data`` with target ``'V1'``.

    The value returned by the regressor is printed and must be (almost)
    equal to 53.5.
    """
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'V1'

    actual = regressor.mean_squared_error(self.data)
    print(actual)

    self.assertAlmostEqual(53.5, actual)
def test_find_best_split(self):
    """Check the split chosen for a root node wrapping ``self.data``.

    The first element of the result must be the split description
    ``('in', 1, ['5', '2', '3'])`` and the second element must be close to
    32.1666... (presumably the impurity of that split -- confirm against
    ``find_best_split``).
    """
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'V1'
    root = Node(data=self.data)

    best = regressor.find_best_split(root)
    print(best)

    wanted_description = ('in', 1, ['5', '2', '3'])
    self.assertTupleEqual(wanted_description, best[0])
    self.assertAlmostEqual(32.166666666, best[1])
def test_impurity(self):
    """Check ``impurity`` for the split ``('<=', 2, 5.5)`` on ``self.data``.

    The tuple looks like (question type, attribute index, threshold) --
    confirm against ``DecisionTreeRegressor.impurity``. The result is
    printed and must be (almost) equal to 39.4375.
    """
    wanted = 39.4375
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'V1'

    actual = regressor.impurity(('<=', 2, 5.5), self.data)
    print(actual)

    self.assertAlmostEqual(actual, wanted)
def test_create_numerical_split(self):
    """Check the numeric split points produced for attribute 0.

    Per the original author's note the attribute's sorted values are
    1, 2, 3, 5, 13, 13, 17, 22; the expected list holds the midpoints of
    consecutive values.
    """
    wanted = [1.5, 2.5, 4, 9, 13, 15, 19.5]
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'Class'

    thresholds = regressor.create_numeric_split(self.data, 0)
    print(thresholds)

    self.assertListEqual(thresholds, wanted)
def test_018(self):
    """Smoke test: build a ``Data`` object from the bank-marketing ARFF file.

    The test makes no assertions; it only verifies that constructing the
    regressor and parsing ``data/bank-marketing.arff`` does not raise.
    """
    tree = DecisionTreeRegressor()
    tree.target_class = 'V1'

    # The context manager guarantees the file handle is released even when
    # Data() raises while reading the file.
    with open("data/bank-marketing.arff") as arff_file:
        Data(arff_file)
        # TODO: re-enable the disabled checks on this dataset:
        #   data.summary(), then Node(data=data) and
        #   tree.find_best_split(node), printing the resulting split.
def test_generate_candidate_splits_for_numeric(self):
    """Check the candidate splits generated for attribute 2.

    Per the original author's note the attribute's sorted values are
    1, 1, 3, 5, 6, 8, 10, 20. The result must be a ``('<=', thresholds)``
    tuple whose thresholds are the midpoints of consecutive values.
    """
    wanted = ('<=', [1, 2, 4, 5.5, 7, 9, 15])
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'Class'

    produced = regressor.generate_candidate_splits(2, self.data)
    print(produced)

    self.assertTupleEqual(produced, wanted)
def test_create_categorical_split(self):
    """Check the categorical groupings produced for attribute 1 (target 'V3').

    Original author's per-category notes for the fixture:
        '1' => 4, '2' => 15, '3' => 1, '4' => no instance, '5' => 7
    The expected groups are growing prefixes of the categories ordered by
    ascending noted value ('3', '1', '5', ...).
    """
    wanted = [['3'], ['3', '1'], ['3', '1', '5']]
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'V3'

    groups = regressor.create_categorical_split(self.data, 1)
    print(groups)

    self.assertListEqual(groups, wanted)
def test_generate_candidate_splits_for_categorical(self):
    """Check the candidate splits generated for attribute 1 (target 'V1').

    Original author's per-category notes for the fixture:
        '1' => (13 + 22) / 2 = 17.5, '2' => 9, '3' => 9,
        '4' => no instances, '5' => 2.5
    The result must be an ``('in', groups)`` tuple whose groups are growing
    prefixes of the categories ordered by ascending noted value.
    """
    wanted = ('in', [['5'], ['5', '2'], ['5', '2', '3']])
    regressor = DecisionTreeRegressor()
    regressor.target_class = 'V1'

    produced = regressor.generate_candidate_splits(1, self.data)
    print(produced)

    self.assertTupleEqual(produced, wanted)
def fit(self, X, y):
    """Fit up to ``n_learners`` boosting rounds, one tree per label per round.

    Each round turns the running per-label scores into probabilities with a
    softmax over the label axis, then fits one ``DecisionTreeRegressor`` per
    unique label on the residual ``(y == label) - probability``. The scores
    are updated with ``alpha`` times the values returned by
    ``__eval_leaves`` for the fitted tree.

    When ``n_iters_stop`` is truthy the loss is evaluated every round and
    fitting stops once it has changed by at most ``loss_tol`` for
    ``n_iters_stop`` consecutive rounds.

    Parameters
    ----------
    X : input samples; converted with ``__get_values`` before use.
    y : labels of ``X``; converted with ``__get_values`` before use.

    Sets ``regressors_`` (one list of trees per completed round),
    ``unique_outputs_``, ``n_outputs_`` and ``loss_``. Returns ``None``.
    """
    X_ = self.__get_values(X)
    y_ = self.__get_values(y)

    self.regressors_ = []
    self.unique_outputs_ = np.unique(y_)
    self.n_outputs_ = len(self.unique_outputs_)

    # One row of raw scores per label, one column per sample.
    estimations = np.zeros((self.n_outputs_, len(y_)))
    self.loss_ = np.inf
    stable_loss_count = 0

    for _ in range(self.n_learners):
        # Softmax over the label axis. Subtracting the per-sample maximum
        # leaves the result mathematically unchanged but keeps np.exp from
        # overflowing to inf (and the division from yielding NaN) once the
        # scores grow large. ``initial`` keeps the reduction defined when
        # there are no labels at all.
        shift = np.max(estimations, axis=0, initial=-np.inf)
        prob_exp = np.exp(estimations - shift)
        probs = prob_exp / np.sum(prob_exp, axis=0)
        curr_regressor_group = []

        if self.n_iters_stop:
            curr_loss = self._loss(y_, probs)
            if abs(curr_loss - self.loss_) <= self.loss_tol:
                stable_loss_count += 1
                if stable_loss_count == self.n_iters_stop:
                    # Record the final loss before leaving; the round that
                    # triggers the stop does not add any trees.
                    self.loss_ = curr_loss
                    break
            else:
                stable_loss_count = 0
            self.loss_ = curr_loss

        for k, y_k in enumerate(self.unique_outputs_):
            model = DecisionTreeRegressor(tol=self.tol,
                                          max_depth=self.max_depth,
                                          min_members=self.min_members,
                                          criterion='mse',
                                          split_method=self.split_method,
                                          max_features=self.max_features)
            # Residual of the one-vs-rest indicator against the current
            # probability of label k.
            y_i = y_ == y_k
            residuals = y_i - probs[k]
            model.fit(X_, residuals)
            res_pred = self.__eval_leaves(model.tree_, X_, residuals)
            estimations[k] += res_pred * self.alpha
            curr_regressor_group.append(model)

        self.regressors_.append(curr_regressor_group)
def fit(self, X, y):
    '''
    Create ``n_trees`` regressors and build them through ``Parallel``.

    Parameters
    ----------
    X : matrix of shape = [n_samples, n_features]
        The matrix consisting of the input data

    y : array of shape = [n_samples]
        The label of X

    Each freshly constructed tree is handed, together with ``X`` and ``y``,
    to ``_parallel_build``; whatever those calls return replaces
    ``self.trees_``.
    '''
    forest = []
    for _ in range(self.n_trees):
        forest.append(
            DecisionTreeRegressor(self.max_features_num,
                                  self.max_depth,
                                  self.min_samples_split))
    self.trees_ = forest

    # presumably joblib's Parallel/delayed -- n_processes sets n_jobs
    runner = Parallel(n_jobs=self.n_processes)
    self.trees_ = runner(
        delayed(_parallel_build)(tree, X, y) for tree in self.trees_)
def test_find_next_partition(self):
    """Exercise node selection and partitioning on the bank-marketing data.

    A single leaf root is built from ``data/bank-marketing.arff``; the test
    asks the tree (``max_leaf_nodes=3``) for the best node to split,
    partitions it, and prints the leaf count plus summaries of the
    resulting children. It makes no assertions -- it fails only if one of
    these steps raises (for example if an expected child is missing).
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=3)
    tree.target_class = 'V1'
    # NOTE(review): the original carried the sample value
    # (('in', 1, ['5', '2', '3']), 32.16666666666667) here; it looks like a
    # best-split result from a different fixture -- confirm or remove.

    # Keep the file open for the whole test (Data may read lazily) and let
    # the context manager close it afterwards, even when a step raises.
    with open("data/bank-marketing.arff") as arff_file:
        data = Data(arff_file)
        node = Node(data=data)
        node.is_leaf = True
        tree.root = node

        change, next_node = tree.find_best_node_to_split(node)
        print(change, next_node)

        tree.partition(next_node)
        print(tree.n_leaves)

        tree.root.left_child.data.summary()
        tree.root.right_child.data.summary()
        tree.root.right_child.left_child.data.summary()
        tree.root.right_child.right_child.data.summary()