def construct_general_tree(verbose, heading, complete_data, enquired_column,
                           m):
    available_columns = []
    for col in range(len(heading)):
        if col != enquired_column:
            available_columns.append(col)
    tree = TreeNode()
    printfv(2, verbose, "We start the construction with the root node to " +
            "create the first node of the tree.\n")
    add_children_to_node(verbose, tree, heading, complete_data,
                         available_columns, enquired_column, m)
    return tree
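# TreeNode is used throughout these snippets but not defined in them. Below
# is a minimal sketch consistent with how it is used here (no-argument
# construction for the root, (variable, value) for children, name() and
# add_child()); the author's real class may carry additional state:
class TreeNode:
    def __init__(self, var=None, val=None):
        self.var = var  # heading of the variable on the incoming branch
        self.val = val  # value of that variable for this branch
        self.children = []

    def add_child(self, child):
        self.children.append(child)

    def name(self):
        # The root has no incoming branch; other nodes read "[variable=value]".
        if self.var is None and self.val is None:
            return "[root]"
        return "[" + str(self.var) + "=" + str(self.val) + "]"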
def construct_random_decision_tree(verbose, heading, complete_data,
                                   enquired_column, m):
    sample = sample_with_replacement(complete_data, len(complete_data))
    printfv(2, verbose, "We are given %d features as the input data. " +
            "Out of these, we choose randomly %d features with the " +
            "replacement that we will use for the construction of " +
            "this particular random decision tree:\n" +
            str(sample) + "\n", len(complete_data),
            len(complete_data))
    return decision_tree.construct_general_tree(verbose, heading,
                                                sample,
                                                enquired_column, m)
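# sample_with_replacement is not defined in these snippets. A minimal sketch
# of the standard bootstrap sampling it names: draw `count` rows uniformly at
# random, allowing the same row to be picked more than once.
import random

def sample_with_replacement(data, count):
    return [random.choice(data) for _ in range(count)]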
# Example 3
import random
def select_col(verbose, heading, complete_data, available_columns,
               enquired_column, m):
    printfv(
        2, verbose, "The available variables that we have still left are " +
        str(numbers_to_strings(available_columns, heading)) + ". ")
    if len(available_columns) < m:
        printfv(
            2, verbose, "As there are fewer of them than the " +
            "parameter m=%d, we consider all of them. ", m)
        sample_columns = available_columns
    else:
        sample_columns = random.sample(available_columns, m)
        printfv(
            2, verbose, "We choose a subset of them of size m to be " +
            str(numbers_to_strings(available_columns, heading)) + ".")

    selected_col = -1
    selected_col_information_gain = -1
    for col in sample_columns:
        current_information_gain = col_information_gain(
            complete_data, col, enquired_column)
        if current_information_gain > selected_col_information_gain:
            selected_col = col
            selected_col_information_gain = current_information_gain
    printfv(
        2, verbose, "Out of these variables, the variable with " +
        "the highest information gain is the variable " +
        heading[selected_col] +
        ". Thus we will branch the node further on this " + "variable. " +
        "We also remove this variable from the list of the " +
        "available variables for the children of the current node. ")
    return selected_col
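# col_information_gain is not shown in these snippets. A sketch under the
# standard definition IG(col) = H(enquired) - H(enquired | col), where H is
# the Shannon entropy in bits:
import math

def entropy(values):
    counts = {}
    for v in values:
        counts[v] = counts.get(v, 0) + 1
    total = float(len(values))
    return -sum((c / total) * math.log(c / total, 2)
                for c in counts.values())

def col_information_gain(complete_data, col, enquired_column):
    base = entropy([row[enquired_column] for row in complete_data])
    conditional = 0.0
    for value in set(row[col] for row in complete_data):
        subset = [row[enquired_column] for row in complete_data
                  if row[col] == value]
        conditional += (len(subset) /
                        float(len(complete_data))) * entropy(subset)
    return base - conditional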
def display_classification_for_feature(verbose, random_forest, heading,
                                       enquired_column, feature):
    classification = {}
    for i in range(0, len(random_forest)):
        group = decision_tree.classify_by_tree(
            random_forest[i], heading, enquired_column, feature)
        common.dic_inc(classification, group)
        printfv(0, verbose, "Tree " + str(i) +
                " votes for the class: " + str(group) + "\n")
    printfv(0, verbose, "The class with the maximum number of votes " +
            "is '" + str(common.dic_key_max_count(classification)) +
            "'. Thus the constructed random forest classifies the " +
            "feature " + str(feature) + " into the class '" +
            str(common.dic_key_max_count(classification)) + "'.\n")
def construct_random_forest(verbose, heading, complete_data,
                            enquired_column, m, tree_count):
    printfv(2, verbose, "*** Random Forest construction ***\n")
    printfv(2, verbose, "We construct a random forest that will " +
            "consist of %d random decision trees.\n", tree_count)
    random_forest = []
    for i in range(0, tree_count):
        printfv(2, verbose, "\nConstruction of a random " +
                "decision tree number %d:\n", i)
        random_forest.append(construct_random_decision_tree(
            verbose, heading, complete_data, enquired_column, m))
    printfv(2, verbose, "\nTherefore we have completed the " +
            "construction of the random forest consisting of %d " +
            "random decision trees.\n", tree_count)
    return random_forest
def display_classification(verbose, random_forest, heading,
                           enquired_column, incomplete_data):
    printfv(0, verbose, "\n***Classification***\n")
    printfv(3, verbose, "Since for the construction of a random " +
            "decision tree we use only a subset of the original data," +
            " we may not have enough features to form a full tree " +
            "that is able to classify every feature. In such a case" +
            " a tree will not return any class for a particular " +
            "feature that should be classified. Thus we will only " +
            "consider trees that actually classify a feature to " +
            "some specific class.")
    if len(incomplete_data) == 0:
        printfv(0, verbose, "No data to classify.\n")
    else:
        for incomplete_feature in incomplete_data:
            printfv(0, verbose, "\nFeature: " +
                    str(incomplete_feature) + "\n")
            display_classification_for_feature(
                verbose, random_forest, heading,
                enquired_column, incomplete_feature)
# Example 7
import sys

import common
import decision_tree
from common import printfv  # assumption: printfv is defined in common.py

# Program start
if len(sys.argv) < 3:
    sys.exit('Please, input as arguments:\n' +
             '1. the name of the input CSV file.\n' +
             '2. the level of verbosity: 0, 1 or 2\n' +
             '   0 - output only the decision tree,\n' +
             '   1 - also provide some basic information on the ' +
             'construction,\n' +
             '   2 - in addition provide the explanations of the ' +
             'decision tree construction.\n\n' + 'Example use:\n' +
             'python construct_decision_tree.py swim.csv 1')

csv_file_name = sys.argv[1]
verbose = int(sys.argv[2])  # verbosity level, 0 - only decision tree

# Define the enquired column to be the last one,
# i.e. the column defining the decision variable.
(heading, complete_data, incomplete_data,
 enquired_column) = common.csv_file_to_ordered_data(csv_file_name)

printfv(
    1, verbose, "We construct a decision tree given the following " +
    str(len(complete_data)) + " data items: \n" + str(complete_data) + "\n\n")
tree = decision_tree.construct_decision_tree(verbose, heading, complete_data,
                                            enquired_column)
printfv(2, verbose, "\n")
printfv(1, verbose, "***Decision tree graph***\n")
decision_tree.display_tree(tree)
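# An illustrative run, assuming a toy swim.csv where the last column is the
# decision variable (the file contents below are an assumption made for the
# example, not the book's actual data):
#
#   swimming_suit,water_temperature,swim
#   Good,Warm,Yes
#   Good,Cold,No
#   None,Cold,No
#
#   $ python construct_decision_tree.py swim.csv 1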
# Example 8
def add_children_to_node(verbose, node, heading, complete_data,
                         available_columns, enquired_column, m):
    if len(available_columns) == 0:
        printfv(
            2, verbose, "We do not have any available variables " +
            "on which we could split the node further, therefore " +
            "we add a leaf node to the current branch of the tree. ")
        add_leaf(verbose, node, heading, complete_data, enquired_column)
        return -1

    printfv(2, verbose,
            "We would like to add children to the node " + node.name() + ".\n")

    selected_col = select_col(verbose, heading, complete_data,
                              available_columns, enquired_column, m)
    # Remove the selected column from the variables available to this subtree.
    available_columns.remove(selected_col)

    data_groups = split_data_by_col(complete_data, selected_col)
    if len(data_groups) == 1:
        printfv(
            2, verbose, "For the chosen variable " + heading[selected_col] +
            " all the remaining features have the same value " +
            str(complete_data[0][selected_col]) + ". " +
            "Thus we close the branch with a leaf node. ")
        add_leaf(verbose, node, heading, complete_data, enquired_column)
        return -1

    if verbose >= 2:
        printfv(
            2, verbose, "Using the variable " + heading[selected_col] +
            " we partition the data in the current node, where" +
            " each partition of the data will be for one of the " +
            "new branches from the current node " + node.name() + ". " +
            "We have the following partitions:\n")
        for child_group, child_data in data_groups.items():
            printfv(
                2, verbose, "Partition for " + str(heading[selected_col]) +
                "=" + str(child_data[0][selected_col]) + ": " +
                str(child_data) + "\n")
        printfv(
            2, verbose, "Now, given the partitions, let us form the " +
            "branches and the child nodes.\n")
    for child_group, child_data in data_groups.items():
        child = TreeNode(heading[selected_col], child_group)
        printfv(
            2, verbose, "\nWe add a child node " + child.name() +
            " to the node " + node.name() + ". " +
            "This branch classifies %d feature(s): " + str(child_data) + "\n",
            len(child_data))
        add_children_to_node(verbose, child, heading, child_data,
                             list(available_columns), enquired_column, m)
        node.add_child(child)
    printfv(
        2, verbose, "\nNow, we have added all the children nodes for the " +
        "node " + node.name() + ".\n")
# Example 9
import math
import sys

import common
from common import printfv  # assumption: printfv is defined in common.py
def add_leaf(verbose, node, heading, complete_data, enquired_column):
    leaf_node = TreeNode(heading[enquired_column],
                         complete_data[0][enquired_column])
    printfv(2, verbose, "We add the leaf node " + leaf_node.name() + ".\n")
    node.add_child(leaf_node)
def choose_m(verbose, M):
    m = int(min(M, math.ceil(2 * math.sqrt(M))))
    printfv(2, verbose, "We are given M=" + str(M) +
            " variables according to which a feature can be " +
            "classified. ")
    printfv(3, verbose, "In random forest algorithm we usually do " +
            "not use all " + str(M) + " variables to form tree " +
            "branches at each node. ")
    printfv(3, verbose, "We use only m variables out of M. ")
    printfv(3, verbose, "So we choose m such that m is less than or " +
            "equal to M. ")
    printfv(3, verbose, "The greater m is, a stronger classifier an " +
            "individual tree constructed is. However, it is more " +
            "susceptible to a bias as more of the data is considered. " +
            "Since we in the end use multiple trees, even if each may " +
            "be a weak classifier, their combined classification " +
            "accuracy is strong. Therefore as we want to reduce a " +
            "bias in a random forest, we may want to consider to " +
            "choose a parameter m to be slightly less than M.\n")
    printfv(2, verbose, "Thus we choose the maximum number of the " +
            "variables considered at the node to be " +
            "m=min(M,math.ceil(2*math.sqrt(M)))" +
            "=min(M,math.ceil(2*math.sqrt(%d)))=%d.\n", M, m)
    return m
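# A few concrete values of the formula m = min(M, math.ceil(2*math.sqrt(M))):
#   M = 4  -> min(4,  ceil(4.0)) = 4   (all variables are considered)
#   M = 9  -> min(9,  ceil(6.0)) = 6
#   M = 16 -> min(16, ceil(8.0)) = 8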
# Program start
if len(sys.argv) < 4:
    sys.exit('Please, input as arguments:\n' +
             '1. the name of the input CSV file,\n' +
             '2. the number of the trees in the random forest,\n' +
             '3. the level of verbosity (0-3):\n' +
             '\t0 for the least output - result of the classification,\n' +
             '\t1 includes in addition the output of the trees constructed ' +
             'and the result of the classification,\n' +
             '\t2 includes in addition brief explanations of the tree ' +
             'construction and classification,\n' +
             '\t3 includes detailed explanations of the algorithm.\n')

csv_file_name = sys.argv[1]
tree_count = int(sys.argv[2])
verbose = int(sys.argv[3])

(heading, complete_data, incomplete_data,
 enquired_column) = common.csv_file_to_ordered_data(csv_file_name)
m = choose_m(verbose, len(heading))
printfv(2, verbose, "We are given the following features:\n" +
        str(complete_data) + "\n When constructing a random " +
        "decision tree as a part of a random forest, we will choose " +
        "only a subset out of them in a random way with the " +
        "replacement.\n\n")

random_forest = construct_random_forest(
    verbose, heading, complete_data, enquired_column, m, tree_count)
display_forest(verbose, random_forest)
printfv(2, verbose, "\n")
printfv(0, verbose, "Total number of trees in the random forest=%d.\n",
        len(random_forest))
printfv(0, verbose, "The maximum number of the variables considered " +
        "at the node is m=%d.\n", m)
display_classification(verbose, random_forest, heading,
                       enquired_column, incomplete_data)
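# display_forest is called above but not shown in these snippets. A minimal
# sketch reusing the display_tree routine from the single-tree program:
def display_forest(verbose, random_forest):
    for i in range(len(random_forest)):
        printfv(1, verbose, "\nTree %d:\n", i)
        decision_tree.display_tree(random_forest[i])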