def train(self, X):
    train_x = self.cross_validation_split(X, self.n_trees, self.fold_size)
    train_x = self.randomize_features(train_x)
    for fold in train_x:
        dt = DecisionTree(MAX_DEPTH, MIN_NODE)
        dt.train(fold)
        self.trees.append(dt)
def algorithm(self, i):
    # Prepares bootstrap data according to the given bootstrap_size
    bootstrap_data_x, bootstrap_data_y = self.get_bootstrap_data()
    tree = DecisionTree(max_depth=self.max_depth, random_subspace=self.n_features)
    tree.fit(bootstrap_data_x, bootstrap_data_y)
    return tree
def train(learning_curve: list[float], output_file: str = 'data/saved_trees/AIBasic.csv') -> None:
    results = []
    d_tree = DecisionTree(move=-1, turn='yellow', subtrees=[])
    random_player = RandomPlayer()
    for t in learning_curve:
        ai = AIPlayerBasic(d_tree, t)
        moves_played, ai_win = run_game(ai, random_player)
        if ai_win:
            moves_played.append(1)
        else:
            moves_played.append(0)
        d_tree.add_game(moves_played)
        results.append(ai_win)
    write_to_file(d_tree, output_file)
    total_win_percent = len([1 for result in results if result]) / len(results)
    recent_wins = 0
    if len(results) > 100:
        flipped_results = results[::-1]
        for i in range(0, 100):
            if flipped_results[i]:
                recent_wins += 1
        recent_win_percent = recent_wins / 100
        print('Recent Win Percentage:', recent_win_percent)
    print('Total Win Percentage:', total_win_percent)
class Emsemble:

    def __init__(self, max_depth, models, stump_class=DecisionStumpErrorRate):
        self.max_depth = max_depth
        self.stump_class = stump_class
        self.models = models
        self.tree_mod = DecisionTree(max_depth, stump_class)

    def fit(self, X, y):
        # Fits a decision tree using greedy recursive splitting, with the
        # base models' predictions stacked as the new feature matrix.
        N, D = X.shape
        L = len(self.models)
        Xnew = np.zeros((N, L))
        cur_col_ind = 0
        for model in self.models:
            y_pred = model.predict(X)
            Xnew[:, cur_col_ind] = y_pred
            cur_col_ind += 1
        self.tree_mod.fit(Xnew, y, self.max_depth, self.stump_class)

    def predict(self, X):
        y = self.tree_mod.pred(X)
        return y
def main(argc, argv):
    """ entry point to the program """
    if argc < 3 or argc > 4:
        sys.exit(
            f"Usage python3 {argv[0]} <training_file> <output_dir> <random_features?>"
        )
    _, training_y, training_x = parse_data.read_data(argv[1], skip_header=False, delimiter=",")
    random_features = None
    if argc >= 4:
        random_features = int(argv[3])
    num_rows = len(training_y)
    while True:
        tree = DecisionTree()
        rows_to_evaluate = random.choices(range(num_rows), k=num_rows)
        tree.train(rows_to_evaluate, training_x, training_y, random_features=random_features)
        filename = f"{argv[2]}/{uuid.uuid4()}.json"
        with open(filename, "w") as out_file:
            out_file.write(tree.to_json())
        print(filename)
def main():
    """ main function """
    # build corpus
    with open("names/female.txt") as textfile:
        females = textfile.readlines()
    females = [name.strip() for name in females]

    with open("names/male.txt") as textfile:
        males = textfile.readlines()
    males = [name.strip() for name in males]

    female_features = [gender_features(name) for name in females]
    male_features = [gender_features(name) for name in males]

    shuffle(female_features)
    shuffle(male_features)

    train_set = [(feature, "female") for feature in female_features[:4500]] + \
                [(feature, "male") for feature in male_features[:2500]]
    test_set = [(feature, "female") for feature in female_features[4500:]] + \
               [(feature, "male") for feature in male_features[2500:]]

    # feed corpus into the tree!
    tree = DecisionTree(train_set)
    print "Trained decision tree (with ID3 heuristic) using {} samples." \
        .format(len(train_set))
    print "Evaluating accuracy with a test set of {} samples..." \
        .format(len(test_set))

    accuracy, nones = tree.evaluate(test_set)
    print "Percent of items classified correctly: {}%" \
        .format(round(accuracy * 100, 2))
    print "Percent of items not classified: {}%" \
        .format(round(nones * 100, 2))
def test_decision_tree(self):
    decision_tree = DecisionTree()
    decision_tree.train(self.sample_input, self.sample_output,
                        feature_label=self.feature_labels)
    self.assertEqual(decision_tree.root.value, "no surfacing")
    test_input = [1, 1]
    test_output = decision_tree.predict(test_input)
    self.assertEqual(test_output, "yes")
def train(self, data, labels):
    """
    Trains the Random Forest by creating and training its constituent trees.

    Parameters
    ----------
    data : np.array
        An (n, d) numpy matrix with numerical (float or int) entries.
        Each row represents a datapoint.
    labels : np.array
        An (n,) numpy array. Each entry represents the label for its
        corresponding datapoint.
    """
    n, f = data.shape
    num_features = int(self.fraction_features * f)
    num_datapoints = int(self.fraction_data * n)
    k = np.max(labels) + 1
    for i in range(self.num_trees):
        subset_of_data_indices = np.random.choice(n, num_datapoints, replace=True)
        data_for_tree, labels_for_tree = data[
            subset_of_data_indices], labels[subset_of_data_indices]
        ignore_feature_indices = set(
            np.random.choice(f, f - num_features, replace=False))
        tree = DecisionTree(max_depth=self.max_depth,
                            ignore_feature_indices=ignore_feature_indices,
                            cat_feature_indices=self.cat_feature_indices,
                            max_num_thresholds=self.max_num_thresholds)
        tree.train(data_for_tree, labels_for_tree, None, k)
        self.trees.append(tree)
def fit_trees(self, i):
    """
    Applies bootstrap sampling, balances the classes, creates a new
    DecisionTree classifier, fits it on the training data, and adds a
    reference to it to a dictionary.

    :param i: unique id
    :return:
    """
    bag_col_sample = self.training_df.sample(frac=.95, replace=False,
                                             random_state=int(time.time()), axis=1)
    balanced_classes = pd.Series([], name='Response', dtype='int32')
    for col_val in self.training_result.unique():
        if self.training_result[self.training_result == col_val].count() >= 1900:
            balanced_classes = balanced_classes.append(
                self.training_result[self.training_result == col_val].sample(
                    n=1900, random_state=int(time.time()), replace=False, axis=0))
        else:
            balanced_classes = balanced_classes.append(
                self.training_result[self.training_result == col_val].sample(
                    n=1200, random_state=int(time.time()), replace=True, axis=0))
    bag = bag_col_sample.join(balanced_classes, how='inner')
    clf = DecisionTree(i, bag, self.min_samples_split, self.max_depth)
    clf.fit()
    self.tree_dict[i] = clf
def test_resultant_tree():
    example_dataset = pd.read_csv('data/benchmark_dataset.tsv', sep='\t')
    decision_tree = DecisionTree(classification_attribute='Joga',
                                 attribute_types={
                                     'Tempo': 'discrete',
                                     'Temperatura': 'discrete',
                                     'Umidade': 'discrete',
                                     'Ventoso': 'discrete'
                                 })
    decision_tree.train(example_dataset)

    expected_tree = json.dumps({
        "('Tempo', 0.247)": {
            "children": [{
                "('Chuvoso', 'Ventoso', 0.971)": {
                    "children": ["('Falso', 'Sim')", "('Verdadeiro', 'Nao')"]
                }
            }, {
                "('Ensolarado', 'Umidade', 0.971)": {
                    "children": ["('Alta', 'Nao')", "('Normal', 'Sim')"]
                }
            }, "('Nublado', 'Sim')"]
        }
    })

    assert decision_tree.to_json() == expected_tree
def run_iteration(self):
    weak_learner = DecisionTree(self.dataset, self.depth)
    weak_learner.build()
    e = 0
    errors = []
    for row in self.dataset:
        if weak_learner.predict(row) != row[-2]:
            e += row[-1]
            errors.append(1)
        else:
            errors.append(0)
    alpha = 0.5 * log((1 - e) / e)
    # print 'e=%.2f a=%.2f' % (e, alpha)
    sum_weights = 0
    for i in range(len(self.dataset)):
        row = self.dataset[i]
        if errors[i] == 1:
            row[-1] = row[-1] * exp(alpha)
        else:
            row[-1] = row[-1] * exp(-alpha)
        sum_weights += row[-1]
    for row in self.dataset:
        row[-1] /= sum_weights
    self.weak_learners.append(weak_learner)
    self.alpha.append(alpha)
def test_fit_predict_classification(self):
    decision_tree = DecisionTree()
    data = np.array([[1], [2], [3]])
    labels = np.array([0, 0, 1])
    decision_tree.fit(data, labels)
    pred = decision_tree.predict([3.5])
    self.assertIn(pred, labels)
def test_fit_predict_regression(self):
    decision_tree = DecisionTree(task="regression")
    data = np.array([[1], [2], [3]])
    labels = [0.5, 0.25, 1.5]
    decision_tree.fit(data, np.array(labels))
    pred = decision_tree.predict([2.5])
    self.assertTrue(isinstance(pred, float))
def fit(self, X, y, n_estimators=10, max_depth=3, min_samples_split=2,
        max_features=None, n_samples=None):
    self.trees = []
    self.tree_features = []
    for _ in range(n_estimators):
        m = len(X[0])
        n = len(y)
        if n_samples:
            idx = choices(population=range(n), k=min(n, n_samples))
        else:
            idx = range(n)
        if max_features:
            n_features = min(m, max_features)
        else:
            n_features = int(m**0.5)
        features = sample(range(m), choice(range(1, n_features + 1)))
        X_sub = [[X[i][j] for j in features] for i in idx]
        y_sub = [y[i] for i in idx]
        clf = DecisionTree()
        clf.fit(X_sub, y_sub, max_depth, min_samples_split)
        self.trees.append(clf)
        self.tree_features.append(features)
def run():
    # Column labels.
    # These are used only to print the tree.
    headers = [
        "Class Name", "Left-Weight", "Left-Distance", "Right-Weight",
        "Right-Distance"
    ]

    # The Balance Scale Weight & Distance Database
    # Format: each row is a new data point.
    # The first column is the label.
    # The last four columns are features.
    training_data = read_data()

    decision_tree = DecisionTree(headers, training_data)
    balance_scale_tree = decision_tree.get_tree()
    print_tree(balance_scale_tree)

    # Evaluate
    # testing_data = read_data()
    testing_data = [['L', 1, 3, 1, 2], ['B', 2, 3, 3, 2], ['R', 2, 5, 4, 4],
                    ['R', 4, 1, 5, 5], ['L', 5, 3, 1, 1]]

    for row in testing_data:
        print('Actual:', row[0], ', Predicted:',
              print_leaf(classify(row, balance_scale_tree)))

    print('\nTotal accuracy:', print_accuracy(testing_data, balance_scale_tree))
def __init__(self, X, count, depth, neg, pos, ssf=False, numfeatures=35):
    self.ssf = ssf
    if (self.ssf == False):
        self.trees = [
            DecisionTree(get_subsample(X, neg, pos).tolist(), depth)
            for _ in xrange(count)
        ]
    else:
        self.feature_subset = [
            np.random.choice([i for i in xrange(len(X[0]) - 1)], numfeatures).tolist() + [35]
            for _ in xrange(count)
        ]
        self.trees = [
            DecisionTree(
                np.asarray(get_subsample([[point[i] for i in fs] for point in X], neg, pos)),
                depth)
            for fs in self.feature_subset
        ]
def fit(self, X, y):
    N = X.shape[0]
    boostrap_inds = np.random.choice(N, N, replace=True)
    bootstrap_X = X[boostrap_inds]
    bootstrap_y = y[boostrap_inds]
    DecisionTree.fit(self, bootstrap_X, bootstrap_y)
def fit(self, X, Y, w=None, w_asymmetric=None, depth=1, T=100, **kwargs):
    self.X = X.copy()
    self.Y = Y.copy()
    N = len(self.Y)

    if w is None:
        w = (1.0/float(N))*numpy.ones(N)
    if w_asymmetric is None:
        w_asymmetric = (1.0/float(N))*numpy.ones(N)
    self.weights = w.copy()
    self.weights_asymmetric = numpy.array([i**(1.0/float(T)) for i in w_asymmetric])
    self.weights /= float(sum(self.weights))
    self.weak_classifier_ensemble = []
    self.alpha = []

    for t in with_progress(range(T), pbar=self.progressbar):
        # Apply asymmetric weights
        self.weights *= self.weights_asymmetric
        weak_learner = DecisionTree().fit(self.X, self.Y, self.weights, depth=depth)
        Y_pred = weak_learner.predict(self.X)
        e = sum(0.5*self.weights*abs(self.Y-Y_pred))/sum(self.weights)
        if e > 0.5:
            logging.warning(' ending training, no good weak classifiers.')
            break
        ee = (1.0-e)/float(e)
        alpha = 0.5*math.log(ee)
        # increase weights for wrongly classified:
        self.weights *= numpy.exp(-alpha*self.Y*Y_pred)
        self.weights /= sum(self.weights)
        self.weak_classifier_ensemble.append(weak_learner)
        self.alpha.append(alpha)
    return self
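# A small, self-contained numeric check of the AdaBoost weight update used in
# the fit above (illustrative values only, not taken from the original code
# base): with labels and predictions in {-1, +1}, e is the weighted error
# rate, alpha = 0.5*ln((1-e)/e), and multiplying by exp(-alpha*Y*Y_pred)
# raises the weight of misclassified points and lowers the rest before
# renormalizing.
import numpy as np

Y = np.array([1, 1, -1, -1])          # true labels
Y_pred = np.array([1, -1, -1, -1])    # the weak learner gets example 1 wrong
w = np.ones(4) / 4.0                  # uniform starting weights

e = np.sum(0.5 * w * np.abs(Y - Y_pred)) / np.sum(w)   # 0.25
alpha = 0.5 * np.log((1 - e) / e)                       # ~0.549
w = w * np.exp(-alpha * Y * Y_pred)
w = w / np.sum(w)
print(e, alpha, w)   # the misclassified example now carries weight 0.5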
class TestDecisionTree(unittest.TestCase):

    def setUp(self):
        self.tree = DecisionTree()
        self.data = [{"male": True, "tall": False, "rich": True, "married": False},
                     {"male": False, "tall": True, "rich": True, "married": True},
                     {"male": False, "tall": True, "rich": False, "married": True},
                     {"male": True, "tall": True, "rich": True, "married": True},
                     {"male": False, "tall": True, "rich": False, "married": True}]

    def test_entropy_corectness(self):
        entropy = self.tree.entropy(self.data, "married")
        is_within = entropy > 0.721 and entropy < 0.722
        is_within.should.be.ok

    def test_gain_corectness(self):
        gain = self.tree.gain(self.data, "male", "married")
        is_within = gain > 0.321 and gain < 0.322
        is_within.should.be.ok

    def test_find_best_attribute(self):
        attributes = list(self.data[0].keys())
        attributes.remove("married")
        best_attribute = self.tree.find_best_attribute(self.data, attributes, "married")
        best_attribute.should.eql("tall")

    def test_grow_decision_tree(self):
        attributes = list(self.data[0].keys())
        attributes.remove("married")
        root_node = self.tree.grow(self.data, attributes, "married")
        root_node.label.should.eql("tall")
def test_train_method(self):
    decision_tree = DecisionTree()
    decision_tree.train(self.sample_input, self.sample_output,
                        feature_label=self.feature_labels)
    self.assertIsNotNone(decision_tree.root, 'Decision tree must have a root node')
def training(train_filepath):
    train_data = Instance.read(train_filepath)
    algo = Id3()
    dt = DecisionTree(train_data, algo)
    path = dt.train()
    return path
def test_choose_feature_to_split(self):
    decision_tree = DecisionTree()
    feature_to_split = decision_tree._select_feature_to_split(
        self.sample_input, self.sample_output)
    self.assertEqual(
        feature_to_split, 0,
        'The best feature index to pick is 0, but got %d' % feature_to_split)
def fit(self, X, y):
    self._classes = np.unique(y)

    # For each decision tree, draw a bootstrap sample of the original data
    # (sampling with replacement) and randomly select the features to use.
    bootstrapped_X, bootstrapped_y = self._bootstrap_sample(X, y)
    for i, (i_bootstrapped_X, i_bootstrapped_y) in enumerate(zip(bootstrapped_X, bootstrapped_y)):
        tree = DecisionTree()
        tree.fit(i_bootstrapped_X, i_bootstrapped_y)
        self._forest[i] = tree
def fitting(self):
    # TODO: Train `num_trees` decision trees using the bootstraps datasets
    # and labels by calling the learn function from your DecisionTree class.
    for i in range(0, self.num_trees):
        decisiontree = DecisionTree()
        self.decision_trees[i] = decisiontree.learn(
            self.bootstraps_datasets[i], self.bootstraps_labels[i])
def fit(self, X, y):
    rs = ShuffleSplit(n_splits=self.forest_size, train_size=0.75, random_state=42)
    for train_index, _ in rs.split(X):
        X_cur = X[train_index]
        y_cur = y[train_index]
        tree = DecisionTree(self.min_split, self.min_leaf)
        tree.fit(X_cur, y_cur)
        self.forest.append(tree)
def fit(self, X, y):
    self.trees = []
    for _ in range(self.n_trees):
        tree = DecisionTree(min_samples_split=self.min_samples_split,
                            max_depth=self.max_depth, n_feats=self.n_feats)
        X_samp, y_samp = bootstrap_sample(X, y)
        tree.fit(X_samp, y_samp)
        self.trees.append(tree)
def test_train_with_gini(self):
    algo = Gini()
    dt1 = DecisionTree(self.instances1, algo)
    path1 = dt1.train()
    assert path1 == self.gt1

    dt2 = DecisionTree(self.instances2, algo)
    path2 = dt2.train()
    assert path2 == self.gt2
def test_info_gain(self):
    expected_info_gain = 0.14
    question = Question(0, 'Green')
    true_rows, false_rows = DecisionTree._partition(
        self.training_data, question)
    current_uncertainty = DecisionTree._gini(self.training_data)
    info_gain = DecisionTree.info_gain(true_rows, false_rows, current_uncertainty)
    self.assertAlmostEqual(expected_info_gain, info_gain)
def test_info_gain_with_better_split(self):
    expected_info_gain = 0.3733333
    question = Question(0, 'Red')
    true_rows, false_rows = DecisionTree._partition(
        self.training_data, question)
    current_uncertainty = DecisionTree._gini(self.training_data)
    info_gain = DecisionTree.info_gain(true_rows, false_rows, current_uncertainty)
    self.assertAlmostEqual(expected_info_gain, info_gain)
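# The expected values in the two tests above (0.14 and ~0.3733) are consistent
# with the classic five-row toy fruit dataset (color, size, label); assuming
# that is the fixture, this self-contained sketch reproduces the arithmetic:
# Gini impurity of the parent minus the size-weighted Gini of the two
# partitions. Names here are illustrative, not the project's actual helpers.
def gini(rows):
    counts = {}
    for row in rows:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    impurity = 1.0
    for label in counts:
        impurity -= (counts[label] / float(len(rows))) ** 2
    return impurity

def info_gain(true_rows, false_rows, current_uncertainty):
    p = float(len(true_rows)) / (len(true_rows) + len(false_rows))
    return current_uncertainty - p * gini(true_rows) - (1 - p) * gini(false_rows)

rows = [['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Red', 1, 'Grape'],
        ['Red', 1, 'Grape'], ['Yellow', 3, 'Lemon']]
parent = gini(rows)                                              # 0.64
green = [r for r in rows if r[0] == 'Green']
not_green = [r for r in rows if r[0] != 'Green']
print(info_gain(green, not_green, parent))                       # 0.14
red = [r for r in rows if r[0] == 'Red']
not_red = [r for r in rows if r[0] != 'Red']
print(info_gain(red, not_red, parent))                           # ~0.3733333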
def build_predict_tree(k, max_depth, m, X, y):
    tree = DecisionTree(sample_features=k, max_depth=max_depth, best_attr_method='gini')
    sample = np.random.choice(X.shape[0], m, replace=True)
    tree.fit(X[sample], y[sample])
    #self.trees.append(tree)
    #D.append(tree.predict(X))
    return tree
def train(self, X, y):
    for i in range(self.n_trees):
        X_train, y_train = self.subsample(X, y, self.sample_size)
        X_train = self.drop_features(X_train, self.max_features)
        tree = DecisionTree(max_depth=self.max_depth,
                            split_val_metric=self.split_val_metric,
                            split_node_criterion=self.split_node_criterion,
                            min_info_gain=self.min_info_gain)
        tree.train(X_train, y_train)
        self.trees.append(tree)
    return self
def problem_12():
    train_data = get_train_data()
    test_data = get_test_data()
    dt = DecisionTree(train_data, None)
    E_in = get_error_rate(train_data[:, -1], dt.predict_all(train_data[:, :-1]))
    E_out = get_error_rate(test_data[:, -1], dt.predict_all(test_data[:, :-1]))
    print('E in: {}'.format(E_in))
    print('E out: {}'.format(E_out))
def test_decision_tree(self):
    decision_tree = DecisionTree()
    decision_tree.train(self.sample_input, self.sample_output,
                        feature_label=self.feature_labels)
    self.assertEqual(decision_tree.root.value, 'no surfacing')
    test_input = [1, 1]
    test_output = decision_tree.predict(test_input)
    self.assertEqual(test_output, 'yes')
def fitting(self):
    # TODO: Train `num_trees` decision trees using the bootstraps datasets
    # and labels by calling the learn function from your DecisionTree class.
    for i in range(self.num_trees):
        # print("training tree #", i)
        a_tree = DecisionTree()
        XX = self.bootstraps_datasets[i]
        y = self.bootstraps_labels[i]
        a_tree.learn(XX, y)
        self.decision_trees[i] = a_tree
def train(self, train_data, train_labels):
    for i in range(self.num_trees):
        # data bagging
        sample_idx = np.random.randint(0, train_data.shape[0], self.sample_size)
        t_data, t_labels = train_data[sample_idx], train_labels[sample_idx]
        # attr bagging
        dtree = DecisionTree(max_depth=self.max_depth)
        dtree.train(t_data, t_labels, attr_bagging_size=self.feature_size)
        self.decision_trees.append(dtree)
def __init__(self):
    self.instances1 = Instance.read('./data/test1.dat')
    dt = DecisionTree(self.instances1, Id3())
    dpath = dt.train()
    dpath.dump('./data/test1.dat.path')
    self.path1 = DecisionTreeResult.load('./data/test1.dat.path')

    self.instances2 = Instance.read('./data/test2.dat')
    dt = DecisionTree(self.instances2, Id3())
    dpath = dt.train()
    dpath.dump('./data/test2.dat.path')
    self.path2 = DecisionTreeResult.load('./data/test2.dat.path')

    self.dtr = DecisionTreeRefiner()
def train(self, training_examples, train_on_subset=True, num_trees=100,
          features_considered_per_node=2, **kwds):
    print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % (
        num_trees, len(training_examples), features_considered_per_node)
    self.trees = []
    total_test_output_stats = SummaryStats()
    binary_classification = all(example["_OUTPUT"] in [0, 1] for example in training_examples)
    #binary_classification = True
    #for example in training_examples:
    #    output = example["_OUTPUT"]
    #    if output not in [0,1]:
    #        binary_classification = False
    #        break
    for tree_i in xrange(1, num_trees+1):
        tree = DecisionTree()
        self.trees.append(tree)
        test_set_ids = set(xrange(len(training_examples)))
        for i in xrange(len(training_examples)):
            if train_on_subset:
                # N samples with replacement ("bootstrap")
                index = random.randint(0, len(training_examples)-1)
            else:
                index = i
            tree.add_example(training_examples[index])
            test_set_ids.discard(index)
        print "Growing tree %d/%d ..." % (tree_i, num_trees),
        tree.grow_tree(features_considered_per_node=features_considered_per_node)
        # Report the in-sample training error
        if binary_classification:
            print "area-under-curve for %d training examples is %2.2f" % (
                len(tree.examples), tree.test(tree.examples, print_level=0))
        else:
            print "%2.2f avg err^2 on %d training examples" % (
                tree.avg_squared_error(), len(tree.examples)),
        # Report the out-of-sample testing error, if we have any out-of-sample
        # examples to test on.
        if train_on_subset:
            print "; ",
            test_set = [training_examples[i] for i in test_set_ids]
            if binary_classification:
                # Do a true out-of-sample test just on this one tree
                # Temporarily make this a forest-of-one-tree...
                save_trees = self.trees
                self.trees = [tree]
                self.test(test_set)
                self.trees = save_trees
            else:
                avg_squared_error = tree.avg_squared_error(test_set)
                total_test_output_stats.add(avg_squared_error)
                print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (
                    len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i),
        print
class SPDRWorker:

    def __init__(self, bins_count=30):
        self.data = list()
        self.node2indexes = defaultdict(list)
        self.decision_tree = DecisionTree()
        self.histograms = dict()
        self.bins_count = bins_count

    def add_object(self, original_class, features):
        index = len(self.data)
        self.data.append((original_class, features))
        node_index = self.decision_tree.navigate(features)
        self.node2indexes[node_index].append(index)
        self.update_histogram_in_tree(node_index, original_class, features)

    def get_histogram(self, node_index, feature_index, class_key):
        if node_index not in self.histograms:
            self.histograms[node_index] = dict()
        if feature_index not in self.histograms[node_index]:
            self.histograms[node_index][feature_index] = dict()
        if class_key not in self.histograms[node_index][feature_index]:
            self.histograms[node_index][feature_index][class_key] = Histogram(self.bins_count)
        return self.histograms[node_index][feature_index][class_key]

    def split_node(self, node_index, feature_index, split_threshold):
        self.clear_histograms(node_index)
        left_child, right_child = self.decision_tree.split_node(node_index, feature_index, split_threshold)
        for index in self.node2indexes[node_index]:
            class_key, features = self.data[index]
            if features[feature_index] < split_threshold:
                self.node2indexes[left_child].append(index)
                self.update_histogram_in_tree(left_child, class_key, features)
            else:
                self.node2indexes[right_child].append(index)
                self.update_histogram_in_tree(right_child, class_key, features)

    def update_histogram_in_tree(self, node_index, original_class, features):
        for feature_index in xrange(len(features)):
            self.get_histogram(node_index, feature_index, original_class).add(features[feature_index])

    def get_classes_from_node_index(self, node_index):
        result = defaultdict(int)
        for index in self.node2indexes[node_index]:
            result[self.data[index][0]] += 1
        return result

    def clear_histograms(self, node_index):
        if node_index in self.histograms:
            del(self.histograms[node_index])
def setUp(self):
    self.tree = DecisionTree()
    self.data = [{"male": True, "tall": False, "rich": True, "married": False},
                 {"male": False, "tall": True, "rich": True, "married": True},
                 {"male": False, "tall": True, "rich": False, "married": True},
                 {"male": True, "tall": True, "rich": True, "married": True},
                 {"male": False, "tall": True, "rich": False, "married": True}]
def test_split_data(self):
    new_sample, new_output, sub_feature_list = DecisionTree._split_data_set(
        self.sample_input, self.sample_output, 0, 1, self.feature_labels
    )
    np.testing.assert_array_equal(new_sample, np.array([[1], [1], [0]]))
    self.assertListEqual(new_output, ["yes", "yes", "no"])
    self.assertListEqual(sub_feature_list, ["flippers"])
def main():
    iris = load_iris()
    data_train, data_test, label_train, label_test = train_test_split(iris.data, iris.target)
    dt = DecisionTree()
    dt.fit(data_train, label_train)
    pred = dt.predict(data_test)
    #dt.print_tree()
    #print(iris)
    print(data_train)
    print(label_train)
    print(data_test)
    print(label_test)
    print(pred)
    print(confusion_matrix(label_test, pred))
def learn(self, features_filepath):
    decision_tree = DecisionTree()
    current_node_index = decision_tree.get_next_non_terminal_node()
    node2indexes = defaultdict(list)
    data = []
    original_G = None
    _logger.debug("Reading data")
    index = 0
    features_count = None
    for current_object in ObjectReader().open(features_filepath):
        original_class = get_class_from_object(current_object)
        features = current_object.features
        if features_count is None:
            features_count = len(features)
        data.append((original_class, features))
        node2indexes[current_node_index].append(index)
        index += 1
    _logger.debug("End reading data")
    while decision_tree.get_next_non_terminal_node() is not None:
        current_node_index = decision_tree.get_next_non_terminal_node()
        data_indexes = node2indexes[current_node_index]
        _logger.debug("Current node index: " + str(current_node_index))
        _logger.debug("Indexes count: " + str(len(data_indexes)))
        current_G = self.impurity_function.calc(get_classes_count(data, data_indexes))
        _logger.debug("Current G: " + str(current_G))
        if original_G is None:
            original_G = current_G
        if (current_G < self.alpha * original_G) or (len(data_indexes) < self.min_object_in_node):
            _logger.debug("Stop in node")
            class_probabilities = dict()
            for index in data_indexes:
                cls = data[index][0]
                if cls not in class_probabilities:
                    class_probabilities[cls] = 0
                class_probabilities[cls] += 1
            decision_tree.set_node_classification(current_node_index, class_probabilities)
            continue
        splits = []
        for feature_index in xrange(features_count):
            values = sorted([data[index][1][feature_index] for index in data_indexes], key=lambda x: float(x))
            for j in xrange(self.discretization):
                splits.append((feature_index, values[j * len(values) / self.discretization]))
        max_delta = None
        best_feature_index = None
        best_split_threshold = None
        for feature_index, split_threshold in splits:
            left_indexes = []
            right_indexes = []
            for index in data_indexes:
                if data[index][1][feature_index] < split_threshold:
                    left_indexes.append(index)
                else:
                    right_indexes.append(index)
            left_G = self.impurity_function.calc(get_classes_count(data, left_indexes))
            right_G = self.impurity_function.calc(get_classes_count(data, right_indexes))
            tau = 1.0 * len(left_indexes) / len(data_indexes)
            delta = current_G - tau * left_G - (1 - tau) * right_G
            if (max_delta is None) or (delta > max_delta):
                max_delta = delta
                best_feature_index = feature_index
                best_split_threshold = split_threshold
        _logger.debug("Best feature index: " + str(best_feature_index))
        _logger.debug("Best split threshold: " + str(best_split_threshold))
        _logger.debug("Max delta: " + str(max_delta))
        left_child, right_child = decision_tree.split_node(current_node_index, best_feature_index, best_split_threshold)
        left_indexes = []
        right_indexes = []
        for index in data_indexes:
            if data[index][1][best_feature_index] < best_split_threshold:
                left_indexes.append(index)
            else:
                right_indexes.append(index)
        node2indexes[left_child] = left_indexes
        node2indexes[right_child] = right_indexes
    return decision_tree
def test_calculate_shannon_entropy(self):
    h = DecisionTree._calculate_shannon_entropy(self.sample_output)
    self.assertAlmostEqual(h, 0.970951, places=5,
                           msg="Shannon entropy should be 0.970951, but got: %f" % h)
# ..........................
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# Rescale label for Adaboost to {-1, 1}
rescaled_y_train = 2*y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2*y_test - np.ones(np.shape(y_test))

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf=8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
naive_bayes.fit(X_train, y_train)
print "\tLogistic Regression"
logistic_regression.fit(X_train, y_train)
print "\tMultilayer Perceptron"
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
#!coding: utf-8
"""
A demo implementation of the ID3 example in Section 5.3 of
<<统计学习方法>> (Statistical Learning Methods).
"""
import numpy as np

from decision_tree import DecisionTree

if __name__ == "__main__":
    dat = np.loadtxt("./dat.csv", dtype="i", delimiter=",", usecols=range(1, 5))
    # AGE,WORK,HOUSE,PASS
    dt = DecisionTree(column_names=("age", "work", "house", "pass"))
    tree = dt.training(dat)
    tree_dict = tree.to_dict()["children"]["root"]

    from utils import tree_plot
    from matplotlib import pyplot as plt

    print tree_dict
    tree_plot.show_tree(plt, tree_dict)
    plt.show()
def q14():
    tree = DecisionTree()
    tree.fit(*load_train())
    print tree.ein
def __init__(self, bins_count=30):
    self.data = list()
    self.node2indexes = defaultdict(list)
    self.decision_tree = DecisionTree()
    self.histograms = dict()
    self.bins_count = bins_count
def q15():
    tree = DecisionTree()
    tree.fit(*load_train())
    print tree.error(*load_test())
def learn(self, features_filepath):
    start_time = time.time()
    k1 = 0
    block1_time = 0
    k2 = 0
    block2_time = 0
    features_used = dict()
    decision_tree = DecisionTree()
    workers = [SPDRWorker(self.worker_bins_count) for _ in xrange(self.workers_count)]
    original_G = None
    features_count = None
    classes = set()
    _logger.debug("Reading data")
    index = 0
    for current_object in ObjectReader().open(features_filepath):
        original_class = get_class_from_object(current_object)
        features = current_object.features
        if features_count is None:
            features_count = len(features)
        classes.add(original_class)
        workers[index].add_object(original_class, features)
        index = (index + 1) % self.workers_count
    _logger.debug("End reading data")
    while decision_tree.get_next_non_terminal_node() is not None:
        current_node_index = decision_tree.get_next_non_terminal_node()
        _logger.debug("Current node: " + str(current_node_index))
        histograms = []
        feature_histogram = []
        for feature_index in xrange(features_count):
            histograms.append(dict())
            feature_histogram.append(Histogram(self.worker_bins_count))
            for class_key in classes:
                if class_key not in histograms[feature_index]:
                    histograms[feature_index][class_key] = Histogram(self.worker_bins_count)
                for worker in workers:
                    histograms[feature_index][class_key].merge(
                        worker.get_histogram(current_node_index, feature_index, class_key))
                feature_histogram[feature_index].merge(histograms[feature_index][class_key])
        classes_in_node_index = defaultdict(int)
        total_elements = 0
        for class_key in classes:
            classes_in_node_index[class_key] = histograms[0][class_key].get_total_elements()
            total_elements += histograms[0][class_key].get_total_elements()
        _logger.debug("Total elements: " + str(total_elements))
        k1 += 1
        cur_time = time.time()
        for worker in workers:
            worker.clear_histograms(current_node_index)
        block1_time += time.time() - cur_time
        decision_tree.set_node_classification(current_node_index, classes_in_node_index)
        if not total_elements:
            raise BaseException("F**k!")
        current_G = self.impurity_function.calc(classes_in_node_index)
        current_R = self.regularization(classes_in_node_index)
        _logger.debug("Impurity: " + str(current_G))
        _logger.debug("Regularization: " + str(current_R))
        if original_G is None:
            original_G = current_G
        if (current_G < self.alpha * original_G) or (total_elements < self.min_object_in_node):
            _logger.debug("Stop in node, total elements: " + str(total_elements))
            continue
        splits = []
        for feature_index in xrange(features_count):
            min_value, max_value = feature_histogram[feature_index].get_min_max_elements()
            values = feature_histogram[feature_index].uniform(self.discretization + 1)
            for value in values:
                if (min_value < value) and (value < max_value):
                    splits.append((feature_index, value))
        max_delta = None
        best_feature_index = None
        best_split_threshold = None
        for feature_index, split_threshold in splits:
            tau = feature_histogram[feature_index].sum(split_threshold) / feature_histogram[feature_index].get_total_elements()
            classes_in_left = dict()
            classes_in_right = dict()
            for class_key in histograms[feature_index]:
                classes_in_left[class_key] = 0
                classes_in_right[class_key] = histograms[feature_index][class_key].get_total_elements()
                if histograms[feature_index][class_key].get_total_elements() > 0:
                    classes_in_left[class_key] = histograms[feature_index][class_key].sum(split_threshold)
                    classes_in_right[class_key] -= classes_in_left[class_key]
            left_R = self.regularization(classes_in_left)
            right_R = self.regularization(classes_in_right)
            delta = current_G - tau * (self.impurity_function.calc(classes_in_left) + left_R) - \
                (1 - tau) * (self.impurity_function.calc(classes_in_right) + right_R)
            if (max_delta is None) or (max_delta < delta):
                max_delta = delta
                best_feature_index = feature_index
                best_split_threshold = split_threshold
        _logger.debug("Best feature index: " + str(best_feature_index))
        _logger.debug("Best split threshold: " + str(best_split_threshold))
        _logger.debug("Max delta: " + str(max_delta))
        if (max_delta is not None) and (max_delta > 0):
            decision_tree.split_node(current_node_index, best_feature_index, best_split_threshold)
            if best_feature_index not in features_used:
                features_used[best_feature_index] = 0
            features_used[best_feature_index] += 1
            k2 += 1
            cur_time = time.time()
            for worker in workers:
                worker.split_node(current_node_index, best_feature_index, best_split_threshold)
            block2_time += time.time() - cur_time
    _logger.info(features_used)
    total_time = time.time() - start_time
    _logger.info("Total time: " + str(total_time))
    _logger.info("k1: " + str(k1))
    _logger.info("block1 time: " + str(block1_time))
    _logger.info("k2: " + str(k2))
    _logger.info("block2 time: " + str(block2_time))
    parallel_time = (total_time - block1_time - block2_time) + (block1_time + block2_time) / self.workers_count
    _logger.info("Parallel time: " + str(parallel_time))
    _logger.info("Acceleration: " + str(total_time / parallel_time))
    _logger.info("Efficiency: " + str(total_time / parallel_time / self.workers_count))
    _logger.info("Workers count: " + str(self.workers_count))
    _logger.info("Bins count: " + str(self.worker_bins_count))
    if self.info_filepath is not None:
        print >> self.info_filepath, "\t".join(map(str, [
            total_time, k1, block1_time, k2, block2_time, parallel_time,
            total_time / parallel_time,
            total_time / parallel_time / self.workers_count,
            self.workers_count, self.worker_bins_count]))
    return decision_tree
def q13():
    tree = DecisionTree()
    tree.fit(*load_train())
    print tree.__prepr__()
    print tree.node_count
def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    averaged over 1,000 trials of 10-fold cross validation

    Return: a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of logistic regression
      stats[1,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    all_accuracies = []
    for trial in range(1000):
        # TODO: shuffle for each of the trials.
        # the following code is for reference only.
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # TODO: write your own code to split data (for cross validation)
        # the code here is for your reference.
        Xtrain = X[1:101, :]  # train on first 100 instances
        Xtest = X[101:, :]
        ytrain = y[1:101, :]  # test on remaining instances
        ytest = y[101:, :]

        # train the decision tree
        classifier = DecisionTree(100)
        classifier.fit(Xtrain, ytrain)

        # output predictions on the remaining data
        y_pred = classifier.predict(Xtest)
        accuracy = accuracy_score(ytest, y_pred)
        all_accuracies.append(accuracy)
        break

    # compute the training accuracy of the model
    meanDecisionTreeAccuracy = np.mean(all_accuracies)

    # TODO: update these statistics based on the results of your experiment
    stddevDecisionTreeAccuracy = 0
    meanLogisticRegressionAccuracy = 0
    stddevLogisticRegressionAccuracy = 0
    meanRandomForestAccuracy = 0
    stddevRandomForestAccuracy = 0

    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
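# The TODO above asks for a per-trial shuffle and a manual data split for
# 10-fold cross validation. A minimal, self-contained sketch of one way to
# generate the fold indices with NumPy; the function and variable names are
# illustrative only and not part of the assignment's required API.
import numpy as np

def ten_fold_indices(n, rng):
    """Yield (train_idx, test_idx) pairs for 10-fold cross validation."""
    idx = rng.permutation(n)            # fresh shuffle for this trial
    folds = np.array_split(idx, 10)     # 10 nearly equal folds
    for k in range(10):
        test_idx = folds[k]
        train_idx = np.concatenate([folds[j] for j in range(10) if j != k])
        yield train_idx, test_idx

rng = np.random.default_rng(0)
for train_idx, test_idx in ten_fold_indices(20, rng):   # replace 20 with n
    print(len(train_idx), len(test_idx))
    # Xtrain, ytrain = X[train_idx], y[train_idx]
    # Xtest, ytest = X[test_idx], y[test_idx]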
def test_choose_feature_to_split(self):
    decision_tree = DecisionTree()
    feature_to_split = decision_tree._select_feature_to_split(self.sample_input, self.sample_output)
    self.assertEqual(feature_to_split, 0,
                     "The best feature index to pick is 0, but got %d" % feature_to_split)
def test_train_method(self):
    decision_tree = DecisionTree()
    decision_tree.train(self.sample_input, self.sample_output,
                        feature_label=self.feature_labels)
    self.assertIsNotNone(decision_tree.root, "Decision tree must have a root node")