def _id3_accuracy(id3, examples):
    # Fraction of examples whose stored label matches the tree's prediction.
    correct_results = 0
    for example in examples:
        if example.get_label() == id3.predict(example):
            correct_results += 1
    return float(correct_results) / float(len(examples))


def run_id3(data, test_data, metric, tree_depth, data_percents, train_data_percents):
    """Train a depth-bounded Id3 tree and print its test and training error.

    :param data: training data set (provides .examples, .attributes, .labels)
    :param test_data: held-out data set used for the test-error report
    :param metric: split-quality metric name forwarded to Id3
    :param tree_depth: maximum depth allowed while fitting
    :param data_percents: optional list; test accuracy is appended when not None
    :param train_data_percents: optional list; training accuracy is appended when not None
    :return: the maximum height the fitted tree reached
    """
    id3 = Id3.Id3(metric)
    print("\n--- Using Tree level " + str(tree_depth) + " ---")
    id3.fit(data.examples, data.attributes, None, data.labels, 0, tree_depth)

    # Test-set accuracy / error. (The accuracy loop was previously duplicated
    # inline for test and train; it is now shared via _id3_accuracy.)
    percentage = _id3_accuracy(id3, test_data.examples)
    if data_percents is not None:
        data_percents.append(percentage)
    print("Test Error: " + "%.16f" % (1.0 - percentage))

    # Training-set accuracy / error.
    percentage = _id3_accuracy(id3, data.examples)
    if train_data_percents is not None:
        train_data_percents.append(percentage)
    print("Training Error: " + "%.16f" % (1.0 - percentage))

    # Capture the height before clearing the tree's running maximum.
    max_height = id3.max_height
    id3.reset_max_height()
    return max_height
def forest_bagged_cross_comparison():
    """Compare bias/variance of 100 bagged random forests against 100 fully
    grown decision trees on the bank data set, printing the squared-error
    decomposition for each.
    """
    # Train data
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = BankData.Data()
    data.initialize_data_from_file(dir_path + '/../../Data/bank/train.csv', False)
    # Test data
    # NOTE(review): test_data is loaded but never used below — the error
    # decomposition is computed on the training data. Confirm this is intended.
    test_data = BankData.Data()
    test_data.initialize_data_from_file(dir_path + '/../../Data/bank/test.csv', False)
    random_forests = []
    full_trees = []
    size = int(
        input(
            "Please enter a number for the cardinality of the set of random attributes:\n"
        ))
    counter = 1
    toolbar_width = 100
    print("Building trees")
    sys.stdout.write("Progress: [%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    for i in range(0, 100):
        # sample 1000 features uniformly without replacement
        examples = get_samples(data)
        forest = RandomForests.RandomForests(t_value=100,
                                             features=examples,
                                             attributes=data.attributes,
                                             size=size)
        forest.fit(print_status_bar=False)
        random_forests.append(forest)
        id3 = Id3.Id3(metric='information_gain')
        # BUG FIX: Id3.fit trains the tree in place (elsewhere in this file the
        # id3 object is used after fit, and fit's return value is never used).
        # Previously this appended fit's return value instead of the trained
        # classifier, leaving full_trees without usable predictors.
        id3.fit(examples, data.attributes, None, data.labels, 0, float("inf"))
        full_trees.append(id3)
        # Redraw the 100-tick progress bar (one tick per outer iteration).
        sys.stdout.write('\r')
        sys.stdout.flush()
        sys.stdout.write('Progress: [%s' % ('#' * counter))
        sys.stdout.write('%s]' % (' ' * (toolbar_width - counter)))
        sys.stdout.flush()
        counter += 1
    print("\nCalculating squared mean error of full trees.")
    full_trees_results = get_squared_mean_error_np(data, full_trees, False)
    print("\nMean Squared Error for the full trees is: " +
          "%.16f" % (full_trees_results[0] + full_trees_results[1]))
    print("Bias was %s, Variance was %s" %
          (full_trees_results[0], full_trees_results[1]))
    print("Calculating squared mean error for bagged trees.")
    random_forest_results = get_squared_mean_error_np(data, random_forests, True)
    print("\nMean Squared Error for the bagged trees is: " +
          "%.16f" % (random_forest_results[0] + random_forest_results[1]))
    print("Bias was %s, Variance was %s" %
          (random_forest_results[0], random_forest_results[1]))
def fit(self, print_status=False):
    """Run the AdaBoost rounds: train one weighted decision stump per round,
    record its vote weight (alpha), and re-weight the training examples.

    :param print_status: set to True if a status printout is desired
    :type print_status: boolean
    :return: None
    :rtype: None
    """
    if print_status:
        print("Building AdaBoost trees")
        sys.stdout.write("Progress: 0 / %s" % self.t_value)
        sys.stdout.flush()
    for t in range(self.t_value):
        # Train a depth-1 weighted stump on the current example weights.
        stump = Id3.Id3(metric='weighted_information_gain')
        stump.fit(features=self.features,
                  attributes=self.attributes,
                  prev_value=None,
                  label_set=(-1, 1),
                  current_depth=0,
                  max_depth=1)
        self.h_classifiers[t] = stump
        # Record this round's prediction for every training example.
        for idx, example in enumerate(self.features):
            self.h_predictions[idx] = float(stump.predict(example))
        # Vote weight derived from the weighted error of the round.
        epsilon = self.get_epsilon()
        alpha = 0.5 * np.log((1.0 - epsilon) / epsilon)
        self.alphas[t] = alpha
        self.update_dt(alpha)
        # Push the refreshed distribution back onto the examples so the next
        # stump sees the updated weights.
        for idx, example in enumerate(self.features):
            example.set_weight(self.dt[idx])
        if print_status:
            sys.stdout.write("\rProgress: %s / %s" % (t + 1, self.t_value))
            sys.stdout.flush()
    if print_status:
        sys.stdout.write('\n')
        sys.stdout.flush()
def fit(self, print_status_bar):
    """Grow the forest: one unbounded-depth tree per bootstrap sample, each
    split restricted to a random attribute subset of cardinality self.size.

    :param print_status_bar: set to True if a status printout is desired
    :type print_status_bar: boolean
    :return: None
    :rtype: None
    """
    ticks = 1
    toolbar_width = 100
    # Throttle the bar so it advances once per `stride` trees (at least 1).
    stride = max(int(self.t_value / toolbar_width), 1)
    if print_status_bar:
        print("Building Bagging Trees")
        sys.stdout.write("Progress: [%s]" % (" " * toolbar_width))
        sys.stdout.flush()
    for tree_idx in range(self.t_value):
        # Get bootstrap example
        sample = get_bootstrap_sample(self.features)
        learner = Id3.Id3(metric='information_gain')
        learner.fit(features=sample,
                    attributes=self.attributes,
                    prev_value=None,
                    label_set=(-1, 1),
                    current_depth=0,
                    max_depth=float("inf"),
                    rand_attribute_size=self.size)
        self.forest.append(learner)
        if tree_idx % stride == 0 and print_status_bar:
            sys.stdout.write('\r')
            sys.stdout.flush()
            sys.stdout.write('Progress: [%s' % ('#' * ticks))
            sys.stdout.write('%s]' % (' ' * (toolbar_width - ticks)))
            sys.stdout.flush()
            ticks += 1
    if print_status_bar:
        print("")
def run_cross_comparison():
    """Train AdaBoost, bagged trees, and a random forest (100 learners each)
    on the credit-default data, plot their train/test errors over `iterates`,
    then report the errors of a single fully grown decision tree.
    """
    # Toggle for the optional progress bar while computing errors below.
    print_error_calculation_status = False
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = CreditDefaultData.Data()
    data.initialize_data_from_file(dir_path + '/../../Data/credit/credit.csv')
    # X-axis points for the comparison plot; it_sum normalizes progress ticks.
    iterates = [1, 2, 5, 10, 15, 20, 30, 40, 50, 70, 90, 100]
    it_sum = float(np.sum(iterates))
    ada_boost_train_error = []
    ada_boost_test_error = []
    bagging_trees_train_error = []
    bagging_trees_test_error = []
    random_forests_train_error = []
    random_forests_test_error = []
    # All three ensembles are trained to completion (100 learners) up front.
    ada_trees = AdaBoost.Adaboost(features=data.train_examples,
                                  attributes=data.attributes,
                                  t_value=100)
    ada_trees.fit(print_status=True)
    bag_trees = BaggingTrees.BaggingTrees(features=data.train_examples,
                                          attributes=data.attributes,
                                          t_value=100,
                                          attribute_factor=2)
    bag_trees.fit(print_status_bar=True)
    r_forest = RandomForests.RandomForests(features=data.train_examples,
                                           attributes=data.attributes,
                                           t_value=100,
                                           size=4)
    r_forest.fit(print_status_bar=True)
    print("Tree construction complete. Calculating Data.")
    toolbar_width = 100
    if print_error_calculation_status:
        sys.stdout.write("Progress: [%s]" % (" " * toolbar_width))
        sys.stdout.flush()
    # NOTE(review): get_error is called with the same fully-trained ensembles
    # on every pass, and the loop index is not passed to it — unless get_error
    # is stateful, every point along `iterates` plots an identical value.
    # Confirm whether evaluation at partial ensemble sizes was intended.
    for i in range(0, len(iterates)):
        ada_boost_train_error.append(get_error(data.train_examples, ada_trees))
        ada_boost_test_error.append(get_error(data.test_examples, ada_trees))
        bagging_trees_train_error.append(
            get_error(data.train_examples, bag_trees))
        bagging_trees_test_error.append(
            get_error(data.test_examples, bag_trees))
        random_forests_train_error.append(
            get_error(data.train_examples, r_forest))
        random_forests_test_error.append(
            get_error(data.test_examples, r_forest))
        if print_error_calculation_status:
            # Tick count proportional to the cumulative share of `iterates`.
            counter = int(
                (float(np.sum(iterates[:i + 1])) / it_sum) * toolbar_width)
            sys.stdout.write('\r')
            sys.stdout.flush()
            sys.stdout.write('Progress: [%s' % ('#' * counter))
            sys.stdout.write('%s]' % (' ' * (toolbar_width - counter)))
            sys.stdout.flush()
    print("")
    # One train/test curve per ensemble; markers distinguish the families.
    plt.plot(iterates, ada_boost_train_error, label='Ada Train')
    plt.plot(iterates, ada_boost_test_error, label='Ada Test')
    plt.plot(iterates, bagging_trees_train_error, label='Bag Train', marker='<')
    plt.plot(iterates, bagging_trees_test_error, label='Bag Test', marker='<')
    plt.plot(iterates, random_forests_train_error, label='Forest Train',
             marker='o')
    plt.plot(iterates, random_forests_test_error, label='Forest Test',
             marker='o')
    plt.legend(loc='best')
    plt.show()
    # Baseline: a single unpruned tree grown to unlimited depth.
    print("Now running statistics on a fully developed decision tree.")
    id3 = Id3.Id3(metric='information_gain')
    id3.fit(data.train_examples, data.attributes, None, data.labels, 0,
            float("inf"))
    dec_tree_train_error = get_error(data.train_examples, id3)
    dec_tree_test_error = get_error(data.test_examples, id3)
    print("Train error for full decision tree is %.16f" % dec_tree_train_error)
    print("Test error for full decision tree is %.16f" % dec_tree_test_error)