Example #1
    def read_tree(self, parent_line, token_line):
        # The parent line holds 1-based parent indices, with 0 marking the
        # root, so shift everything down by one (the root's parent becomes -1).
        parents = [int(x) - 1 for x in parent_line.split()]
        tokens = token_line.strip().split()
        tree_nodes = dict()
        root = None
        for i in range(len(parents)):
            crnt_node_id = i
            if crnt_node_id not in tree_nodes:
                prev_node = None
                # Walk up the parent chain, creating nodes until we reach
                # an already-built node or the root.
                while True:
                    if crnt_node_id == -1:
                        break
                    parent_node_id = parents[crnt_node_id]

                    crnt_node = TreeNode()
                    if prev_node is not None:
                        crnt_node.add_child(prev_node)
                    tree_nodes[crnt_node_id] = crnt_node
                    crnt_node.idx = crnt_node_id
                    crnt_node.token = tokens[crnt_node_id]
                    if parent_node_id in tree_nodes:
                        tree_nodes[parent_node_id].add_child(crnt_node)
                        break
                    elif parent_node_id == -1:
                        root = crnt_node
                        break
                    else:
                        prev_node = crnt_node
                        crnt_node_id = parent_node_id
        return root
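
For context, here is a minimal, self-contained sketch of how read_tree might be driven. The TreeNode below is an assumed stand-in (the original class is not shown): the method only needs add_child plus writable idx and token attributes. The TreeReader wiring is hypothetical and assumes read_tree from Example #1 is available at module scope.

class TreeNode:
    # Assumed minimal interface: read_tree only sets idx/token and links children.
    def __init__(self):
        self.idx = None
        self.token = None
        self.children = []

    def add_child(self, child):
        self.children.append(child)

class TreeReader:
    read_tree = read_tree  # bind the function above (hypothetical wiring)

# Parent line "2 0 2" is 1-based with 0 marking the root: word 2 ("cat")
# is the root, and words 1 ("the") and 3 ("sat") hang off it.
reader = TreeReader()
root = reader.read_tree("2 0 2", "the cat sat")
print(root.token)                        # cat
print([c.token for c in root.children])  # ['the', 'sat']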
Example #2
    def __decision_tree(self, X, Y, features, level, metric, classes):
        # Returns the root of the decision tree (built from TreeNodes) after
        # fitting the training data. Nodes are printed in PREORDER traversal.
        # classes represents the distinct classes in the classification problem
        # metric can take the value "gain_ratio" or "gini_index"
        # level represents the depth of the current node in the tree
        # We split on a particular feature only once (in a given root-to-leaf path)

        # If the node consists of only 1 class
        if len(set(Y)) == 1:
            print("Level", level)
            output = None
            for i in classes:
                if i in Y:
                    output = i
                    print("Count of", i, "=", len(Y))
                else:
                    print("Count of", i, "=", 0)
            if metric == "gain_ratio":
                print("Current Entropy is = 0.0")
            elif metric == "gini_index":
                print("Current Gini Index is = 0.0")

            print("Reached leaf Node")
            print()
            return TreeNode(None, output)

        # If we have run out of features to split upon
        # In this case we will output the class with maximum count
        if len(features) == 0:
            print("Level", level)
            freq_map = self.__count_unique(Y)
            output = None
            max_count = -math.inf
            for i in classes:
                if i not in freq_map:
                    print("Count of", i, "=", 0)
                else:
                    if freq_map[i] > max_count:
                        output = i
                        max_count = freq_map[i]
                    print("Count of", i, "=", freq_map[i])

            if metric == "gain_ratio":
                print("Current Entropy  is =", self.__entropy(Y))
            elif metric == "gini_index":
                print("Current Gini Index is =", self.__gini_index(Y))

            print("Reached leaf Node")
            print()
            return TreeNode(None, output)

        # Finding the best feature to split upon
        max_gain = -math.inf
        final_feature = None
        for f in features:
            if metric == "gain_ratio":
                current_gain = self.__gain_ratio(X, Y, f)
            elif metric == "gini_index":
                current_gain = self.__gini_gain(X, Y, f)

            if current_gain > max_gain:
                max_gain = current_gain
                final_feature = f

        print("Level", level)
        freq_map = self.__count_unique(Y)
        output = None
        max_count = -math.inf

        for i in classes:
            if i not in freq_map:
                print("Count of", i, "=", 0)
            else:
                if freq_map[i] > max_count:
                    output = i
                    max_count = freq_map[i]
                print("Count of", i, "=", freq_map[i])

        if metric == "gain_ratio":
            print("Current Entropy is =", self.__entropy(Y))
            print("Splitting on feature  X[",
                  final_feature,
                  "] with gain ratio ",
                  max_gain,
                  sep="")
            print()
        elif metric == "gini_index":
            print("Current Gini Index is =", self.__gini_index(Y))
            print("Splitting on feature  X[",
                  final_feature,
                  "] with gini gain ",
                  max_gain,
                  sep="")
            print()

        unique_values = set(
            X[:, final_feature]
        )  # unique_values represents the unique values of the feature selected
        df = pd.DataFrame(X)
        # Adding Y values as the last column in the dataframe
        df[df.shape[1]] = Y

        current_node = TreeNode(final_feature, output)

        # Remove the selected feature so it is not split on again along this
        # root-to-leaf path
        index = features.index(final_feature)
        features.remove(final_feature)
        for i in unique_values:
            # Creating a new dataframe with value of selected feature = i
            df1 = df[df[final_feature] == i]
            # Segregating the X and Y values and recursively calling on the splits
            node = self.__decision_tree(df1.iloc[:, 0:df1.shape[1] - 1].values,
                                        df1.iloc[:, df1.shape[1] - 1].values,
                                        features, level + 1, metric, classes)
            current_node.add_child(i, node)

        # Restore the removed feature so sibling branches can still use it
        features.insert(index, final_feature)

        return current_node
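
The TreeNode this example constructs is not shown either; below is a minimal sketch of the interface the code actually relies on (a splitting feature index, a majority-class output, and children keyed by feature value), plus a hypothetical predict_one walk for a single sample with discrete features. Both are assumptions for illustration, not the original implementation.

class TreeNode:
    # Assumed interface: TreeNode(None, output) at leaves,
    # TreeNode(final_feature, output) at internal nodes,
    # add_child(feature_value, node) keyed by the feature's value.
    def __init__(self, data, output):
        self.data = data        # feature index to split on; None at a leaf
        self.output = output    # majority class at this node
        self.children = {}      # feature value -> child TreeNode

    def add_child(self, feature_value, node):
        self.children[feature_value] = node

def predict_one(node, x):
    # Hypothetical helper: follow the sample's feature values down the tree,
    # falling back to the stored majority class on an unseen value.
    while node.data is not None:
        child = node.children.get(x[node.data])
        if child is None:
            return node.output
        node = child
    return node.output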
Example #3
from treenode import TreeNode

# TEST 1
dile = TreeNode("Dile")
edo = TreeNode("Edo")
raffo = TreeNode("Raffo", [dile, edo])
cami = TreeNode("Cami")
stella = TreeNode("Stella", [TreeNode("Jess")])
gabry = TreeNode("Gabry", [TreeNode("Saba"), TreeNode("Luca"), stella])
ale = TreeNode("Ale", [TreeNode("Dave"), gabry, TreeNode("Greg")])
enzo = TreeNode("Enzo", [TreeNode("Diodato")])
fra = TreeNode("Fra", [enzo])
fede = TreeNode("Fede", [ale, fra])

cami.add_child(raffo)
cami.add_child(fede)
peppe = TreeNode(
    "Peppe",
    [TreeNode("Marco"), TreeNode("Gio"),
     TreeNode("Marghe")])
cami.add_child(peppe)
print(cami)
print("\n")

print(cami.give_software())

# TEST 2
tredici = TreeNode("13", [TreeNode("16"), TreeNode("17")])
otto = TreeNode("8", [TreeNode("12"), tredici, TreeNode("14")])
quattro = TreeNode("4", [TreeNode("9"), TreeNode("10")])
sette = TreeNode("7", [TreeNode("11", [TreeNode("15")])])
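
The treenode module imported above is not included in the snippet. A minimal sketch consistent with how the tests use TreeNode (a name, an optional list of children, add_child, and a printable form) could look like the following; give_software is project-specific and not reconstructed here, and the real __str__ may format differently.

class TreeNode:
    # Assumed stand-in for treenode.TreeNode, based only on the calls above.
    def __init__(self, name, children=None):
        self.name = name
        self.children = list(children) if children else []

    def add_child(self, node):
        self.children.append(node)

    def __str__(self, depth=0):
        # Indented preorder rendering of the subtree.
        lines = ["  " * depth + self.name]
        lines += [child.__str__(depth + 1) for child in self.children]
        return "\n".join(lines)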