def construct_general_tree(verbose, heading, complete_data,
                           enquired_column, m):
    available_columns = []
    for col in range(0, len(heading)):
        if col != enquired_column:
            available_columns.append(col)
    tree = TreeNode()
    printfv(2, verbose, "We start the construction with the root node " +
            "to create the first node of the tree.\n")
    add_children_to_node(verbose, tree, heading, complete_data,
                         available_columns, enquired_column, m)
    return tree
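# printfv is used throughout these listings but its definition is not
# shown; it is assumed to live in the common module. A minimal sketch of
# the assumed behavior - a printf-style print that is emitted only when
# the user-selected verbosity is at least the given threshold:
def printfv(level, verbose, fmt, *args):
    # Print only if the verbosity level is high enough for this message.
    if verbose >= level:
        if args:
            print(fmt % args, end='')
        else:
            print(fmt, end='')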
def construct_random_decision_tree(verbose, heading, complete_data,
                                   enquired_column, m):
    sample = sample_with_replacement(complete_data, len(complete_data))
    printfv(2, verbose, "We are given %d features as the input data. " +
            "Out of these, we choose randomly %d features with " +
            "replacement that we will use for the construction of " +
            "this particular random decision tree:\n" +
            str(sample) + "\n", len(complete_data), len(complete_data))
    return decision_tree.construct_general_tree(verbose, heading, sample,
                                                enquired_column, m)
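# sample_with_replacement is referenced above but not shown in this
# listing. A minimal sketch of the assumed helper - draw n items from the
# data uniformly at random, allowing the same item to be drawn more than
# once (the bootstrap sample used by a random forest):
import random

def sample_with_replacement(data, n):
    # random.choice may pick the same element repeatedly, which is
    # exactly what sampling with replacement means.
    return [random.choice(data) for _ in range(n)]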
def select_col(verbose, heading, complete_data, available_columns,
               enquired_column, m):
    printfv(2, verbose, "The available variables that we still have " +
            "left are " +
            str(numbers_to_strings(available_columns, heading)) + ". ")
    if len(available_columns) < m:
        printfv(2, verbose, "As there are fewer of them than the " +
                "parameter m=%d, we consider all of them. ", m)
        sample_columns = available_columns
    else:
        sample_columns = random.sample(available_columns, m)
        printfv(2, verbose, "We choose a subset of them of size m to be " +
                str(numbers_to_strings(sample_columns, heading)) + ".")
    selected_col = -1
    selected_col_information_gain = -1
    for col in sample_columns:
        current_information_gain = col_information_gain(
            complete_data, col, enquired_column)
        if current_information_gain > selected_col_information_gain:
            selected_col = col
            selected_col_information_gain = current_information_gain
    printfv(2, verbose, "Out of these variables, the variable with " +
            "the highest information gain is the variable " +
            heading[selected_col] +
            ". Thus we will branch the node further on this " +
            "variable. " +
            "We also remove this variable from the list of the " +
            "available variables for the children of the current node. ")
    return selected_col
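# col_information_gain is referenced above but not shown here. A minimal
# sketch under the usual definition - the information gain of splitting
# the data on column col with respect to the class in enquired_column,
# IG = H(class) - sum over values v of p(v) * H(class | col = v):
import math
from collections import Counter

def entropy(values):
    # Shannon entropy of the empirical distribution of the values.
    counts = Counter(values)
    total = len(values)
    return -sum((c / total) * math.log2(c / total)
                for c in counts.values())

def col_information_gain(complete_data, col, enquired_column):
    classes = [row[enquired_column] for row in complete_data]
    gain = entropy(classes)
    for value in set(row[col] for row in complete_data):
        subset = [row[enquired_column]
                  for row in complete_data if row[col] == value]
        gain -= (len(subset) / len(complete_data)) * entropy(subset)
    return gain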
def display_classification_for_feature(verbose, random_forest, heading,
                                       enquired_column, feature):
    classification = {}
    for i in range(0, len(random_forest)):
        group = decision_tree.classify_by_tree(
            random_forest[i], heading, enquired_column, feature)
        common.dic_inc(classification, group)
        printfv(0, verbose, "Tree " + str(i) +
                " votes for the class: " + str(group) + "\n")
    printfv(0, verbose, "The class with the maximum number of votes " +
            "is '" + str(common.dic_key_max_count(classification)) +
            "'. Thus the constructed random forest classifies the " +
            "feature " + str(feature) + " into the class '" +
            str(common.dic_key_max_count(classification)) + "'.\n")
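# The vote counting above relies on two small helpers from the common
# module that are not shown in these listings. A minimal sketch of the
# assumed behavior - increment the counter for a key, and return the key
# with the highest count:
def dic_inc(dic, key):
    # Increase the vote count for key, starting from 0 if unseen.
    dic[key] = dic.get(key, 0) + 1

def dic_key_max_count(dic):
    # Return the key with the maximum count, i.e. the majority vote.
    return max(dic, key=dic.get)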
def construct_random_forest(verbose, heading, complete_data,
                            enquired_column, m, tree_count):
    printfv(2, verbose, "*** Random Forest construction ***\n")
    printfv(2, verbose, "We construct a random forest that will " +
            "consist of %d random decision trees.\n", tree_count)
    random_forest = []
    for i in range(0, tree_count):
        printfv(2, verbose, "\nConstruction of a random " +
                "decision tree number %d:\n", i)
        random_forest.append(construct_random_decision_tree(
            verbose, heading, complete_data, enquired_column, m))
    printfv(2, verbose, "\nTherefore we have completed the " +
            "construction of the random forest consisting of %d " +
            "random decision trees.\n", tree_count)
    return random_forest
def display_classification(verbose, random_forest, heading,
                           enquired_column, incomplete_data):
    printfv(0, verbose, "\n***Classification***\n")
    printfv(3, verbose, "Since for the construction of a random " +
            "decision tree we use only a subset of the original data, " +
            "we may not have enough features to form a full tree " +
            "that is able to classify every feature. In such a case " +
            "a tree will not return any class for a particular " +
            "feature that should be classified. Thus we will only " +
            "consider trees that actually classify a feature to " +
            "some specific class.\n")
    if len(incomplete_data) == 0:
        printfv(0, verbose, "No data to classify.\n")
    else:
        for incomplete_feature in incomplete_data:
            printfv(0, verbose, "\nFeature: " +
                    str(incomplete_feature) + "\n")
            display_classification_for_feature(
                verbose, random_forest, heading,
                enquired_column, incomplete_feature)
import sys
import common
import decision_tree
from common import printfv  # printfv is assumed to live in the common module

# Program start
if len(sys.argv) < 3:
    sys.exit('Please, input as arguments:\n' +
             '1. the name of the input CSV file,\n' +
             '2. the level of verbosity: 0, 1 or 2\n' +
             '   0 - output only the decision tree,\n' +
             '   1 - also provide some basic information on the ' +
             'construction,\n' +
             '   2 - in addition provide the explanations of the ' +
             'decision tree construction.\n\n' +
             'Example use:\n' +
             'python construct_decision_tree.py swim.csv 1')

csv_file_name = sys.argv[1]
verbose = int(sys.argv[2])  # verbosity level, 0 - only decision tree

# Define the enquired column to be the last one,
# i.e. the column defining the decision variable.
(heading, complete_data, incomplete_data,
 enquired_column) = common.csv_file_to_ordered_data(csv_file_name)

printfv(1, verbose, "We construct a decision tree given the following " +
        str(len(complete_data)) + " data items:\n" +
        str(complete_data) + "\n\n")
tree = decision_tree.construct_decision_tree(verbose, heading,
                                             complete_data,
                                             enquired_column)
printfv(2, verbose, "\n")
printfv(1, verbose, "***Decision tree graph***\n")
decision_tree.display_tree(tree)
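# csv_file_to_ordered_data is not shown in these listings. Based on the
# comment above, it is assumed to read the CSV file, take the last column
# as the enquired (decision) column, and split the rows into those with a
# known decision value (complete) and those without one (incomplete). A
# minimal sketch of that assumed behavior:
import csv

def csv_file_to_ordered_data(csv_file_name):
    with open(csv_file_name) as f:
        rows = list(csv.reader(f))
    heading = rows[0]
    enquired_column = len(heading) - 1
    complete_data = [r for r in rows[1:] if r[enquired_column] != '']
    incomplete_data = [r for r in rows[1:] if r[enquired_column] == '']
    return heading, complete_data, incomplete_data, enquired_column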
def add_children_to_node(verbose, node, heading, complete_data,
                         available_columns, enquired_column, m):
    if len(available_columns) == 0:
        printfv(2, verbose, "We do not have any available variables " +
                "on which we could split the node further, therefore " +
                "we add a leaf node to the current branch of the tree. ")
        add_leaf(verbose, node, heading, complete_data, enquired_column)
        return -1
    printfv(2, verbose, "We would like to add children to the node " +
            node.name() + ".\n")
    selected_col = select_col(verbose, heading, complete_data,
                              available_columns, enquired_column, m)
    # Remove the chosen variable from the variables available to the
    # children of the current node.
    available_columns.remove(selected_col)
    data_groups = split_data_by_col(complete_data, selected_col)
    if len(data_groups) == 1:
        printfv(2, verbose, "For the chosen variable " +
                heading[selected_col] +
                " all the remaining features have the same value " +
                str(complete_data[0][selected_col]) + ". " +
                "Thus we close the branch with a leaf node. ")
        add_leaf(verbose, node, heading, complete_data, enquired_column)
        return -1
    if verbose >= 2:
        printfv(2, verbose, "Using the variable " +
                heading[selected_col] +
                " we partition the data in the current node, where " +
                "each partition of the data will be for one of the " +
                "new branches from the current node " + node.name() +
                ". We have the following partitions:\n")
        for child_group, child_data in data_groups.items():
            printfv(2, verbose, "Partition for " +
                    str(heading[selected_col]) + "=" +
                    str(child_data[0][selected_col]) + ": " +
                    str(child_data) + "\n")
        printfv(2, verbose, "Now, given the partitions, let us form " +
                "the branches and the child nodes.\n")
    for child_group, child_data in data_groups.items():
        child = TreeNode(heading[selected_col], child_group)
        printfv(2, verbose, "\nWe add a child node " + child.name() +
                " to the node " + node.name() + ". " +
                "This branch classifies %d feature(s): " +
                str(child_data) + "\n", len(child_data))
        add_children_to_node(verbose, child, heading, child_data,
                             list(available_columns), enquired_column, m)
        node.add_child(child)
    printfv(2, verbose, "\nNow, we have added all the children nodes " +
            "for the node " + node.name() + ".\n")
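# split_data_by_col is referenced above but not shown in this listing. A
# minimal sketch of the assumed helper - group the rows of the data by
# their value in the given column:
def split_data_by_col(data, col):
    groups = {}
    for row in data:
        # Rows sharing the same value in column col fall into one group,
        # which becomes one branch from the current node.
        groups.setdefault(row[col], []).append(row)
    return groups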
def add_leaf(verbose, node, heading, complete_data, enquired_column):
    leaf_node = TreeNode(heading[enquired_column],
                         complete_data[0][enquired_column])
    printfv(2, verbose, "We add the leaf node " + leaf_node.name() +
            ".\n")
    node.add_child(leaf_node)
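# TreeNode is used throughout these listings, but its definition is not
# part of them. A minimal sketch of the assumed interface - a node stores
# the variable it was branched on, the value of that variable, and its
# children:
class TreeNode:
    def __init__(self, var=None, value=None):
        self.var = var
        self.value = value
        self.children = []

    def add_child(self, child):
        self.children.append(child)

    def name(self):
        # A readable label for the node, e.g. "[swimming suit=None]";
        # the root node carries no variable or value.
        if self.var is None and self.value is None:
            return "[root]"
        return "[" + str(self.var) + "=" + str(self.value) + "]"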
def choose_m(verbose, M):
    m = int(min(M, math.ceil(2 * math.sqrt(M))))
    printfv(2, verbose, "We are given M=" + str(M) +
            " variables according to which a feature can be " +
            "classified. ")
    printfv(3, verbose, "In the random forest algorithm we usually do " +
            "not use all " + str(M) + " variables to form tree " +
            "branches at each node. ")
    printfv(3, verbose, "We use only m variables out of M. ")
    printfv(3, verbose, "So we choose m such that m is less than or " +
            "equal to M. ")
    printfv(3, verbose, "The greater m is, the stronger a classifier " +
            "an individual tree becomes. However, it is also more " +
            "susceptible to bias as more of the data is considered. " +
            "Since in the end we use multiple trees, even if each may " +
            "be a weak classifier, their combined classification " +
            "accuracy is strong. Therefore, as we want to reduce " +
            "bias in a random forest, we may want to consider " +
            "choosing the parameter m to be slightly less than M.\n")
    printfv(2, verbose, "Thus we choose the maximum number of the " +
            "variables considered at the node to be " +
            "m=min(M,math.ceil(2*math.sqrt(M)))" +
            "=min(M,math.ceil(2*math.sqrt(%d)))=%d.\n", M, m)
    return m
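# A quick worked example of the formula above: for M=5 variables,
# m = min(5, ceil(2*sqrt(5))) = min(5, ceil(4.47)) = 5, while for M=16,
# m = min(16, ceil(2*sqrt(16))) = min(16, 8) = 8, so the cap only starts
# to bite for larger M:
import math

for M in (5, 16):
    print(M, int(min(M, math.ceil(2 * math.sqrt(M)))))  # 5 5, then 16 8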
import sys
import math
import random
import common
import decision_tree
from common import printfv  # printfv is assumed to live in the common module

# Program start
if len(sys.argv) < 4:
    sys.exit('Please, input as arguments:\n' +
             '1. the name of the input CSV file,\n' +
             '2. the number of the trees in the random forest,\n' +
             '3. the level of verbosity:\n' +
             '\t0 for the least output - result of the classification,\n' +
             '\t1 includes in addition the output of the trees ' +
             'constructed and the result of the classification,\n' +
             '\t2 includes in addition brief explanations of the tree ' +
             'construction and classification,\n' +
             '\t3 includes detailed explanations of the algorithm.\n')

csv_file_name = sys.argv[1]
tree_count = int(sys.argv[2])
verbose = int(sys.argv[3])

(heading, complete_data, incomplete_data,
 enquired_column) = common.csv_file_to_ordered_data(csv_file_name)

m = choose_m(verbose, len(heading))
printfv(2, verbose, "We are given the following features:\n" +
        str(complete_data) + "\nWhen constructing a random " +
        "decision tree as a part of a random forest, we will choose " +
        "only a subset out of them in a random way with " +
        "replacement.\n\n")
random_forest = construct_random_forest(
    verbose, heading, complete_data, enquired_column, m, tree_count)
display_forest(verbose, random_forest)
printfv(2, verbose, "\n")
printfv(0, verbose, "Total number of trees in the random forest=%d.\n",
        len(random_forest))
printfv(0, verbose, "The maximum number of the variables considered " +
        "at the node is m=%d.\n", m)
display_classification(verbose, random_forest, heading,
                       enquired_column, incomplete_data)
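# Example use, reusing the swim.csv data file from the decision tree
# example earlier (the script name random_forest.py is illustrative):
#
#   python random_forest.py swim.csv 2 3
#
# This constructs a random forest of 2 random decision trees on swim.csv
# and prints the most detailed explanations (verbosity level 3).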