Code Example #1
 def build(self, X, Y, selected):
     cur = self.Node(None, Y)
     if self.verbose:
         print("Cur selected columns:", selected)
         print("Cur data:")
         pprint(X)
         print(Y)
     split = False
     # split only when there are still unused columns
     # and the labels are not already pure
     if len(selected) != self.column_cnt and len(set(Y)) > 1:
         left_columns = list(set(range(self.column_cnt)) - selected)
         col_ind, best_information_gain = argmax(
             left_columns, key=lambda col: information_gain(X, Y, col))
         col = left_columns[col_ind]
         # if this split is better than not splitting
         if best_information_gain > self.information_gain_threshold:
             if self.verbose:
                 print(f"Split by {col}th column")
             split = True
             cur.col = col
             for val in set(x[col] for x in X):
                 ind = [x[col] == val for x in X]
                 child_X = [x for i, x in zip(ind, X) if i]
                 child_Y = [y for i, y in zip(ind, Y) if i]
                 cur.children[val] = self.build(child_X, child_Y,
                                                selected | {col})
     if not split and self.verbose:
         print("No split")
     return cur
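
The `argmax` helper imported by this example is not shown. Judging from the call site, it returns both the index of the best element and the best key value; a minimal sketch under that assumption (the name and signature are inferred, not taken from the original project) is:

# Hypothetical sketch of the argmax helper assumed by build() above.
# It returns (index_of_best_item, best_key_value), which is how the caller
# unpacks (col_ind, best_information_gain).
def argmax(items, key=lambda x: x):
    scores = [key(item) for item in items]
    best_ind = max(range(len(scores)), key=scores.__getitem__)
    return best_ind, scores[best_ind]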
Code Example #2
def create_tree(data, attributeList, depth):
    print "Pre-order Traversal Depth:", depth, "NumberOfDataPoints", len(data)
    reviews = [ review[1] for review in data]
    sumOfReviews = sum(reviews)


    # stop at the depth limit or when no attributes remain
    if depth == 19 or not attributeList:
        leaf = DecisionTree()
        leaf.node_label = 1 if sumOfReviews > len(reviews) / 2 else 0

        return leaf

    if sumOfReviews == len(reviews):
        leaf = DecisionTree()
        leaf.node_label = 1

        return leaf 

    if sumOfReviews == 0:
        leaf = DecisionTree()
        leaf.node_label = 0

        return leaf 

    maxInfoGain = 0
    maxInfoGainIndex = -1
    for i, attribute in enumerate(attributeList):
        withAttribute = []
        withOutAttribute = []
        infoGain = 0
        for datum in data:
            if attribute in datum[0]:
                withAttribute.append(datum[1])
            else:
                withOutAttribute.append(datum[1])
        if withAttribute == [] or withOutAttribute == []:
            infoGain = 0
        else:
            infoGain = utils.information_gain(reviews, withAttribute, withOutAttribute)
        if maxInfoGain < infoGain:
            maxInfoGain = infoGain
            maxInfoGainIndex = i
    # if no attribute yields positive information gain, label by majority vote
    if maxInfoGainIndex == -1:
        leaf = DecisionTree()
        leaf.node_label = 1 if sumOfReviews > len(reviews) / 2 else 0
        return leaf

    leftChildReviews = []
    rightChildReviews = []
    for review in data:
        if attributeList[maxInfoGainIndex] in review[0]:
            leftChildReviews.append(review)
        else:
            rightChildReviews.append(review)

    node = DecisionTree()
    node.node_word = attributeList[maxInfoGainIndex]
    attributeList.remove(attributeList[maxInfoGainIndex])
    node.left = create_tree(leftChildReviews, attributeList, depth + 1)
    node.right = create_tree(rightChildReviews, attributeList, depth + 1)

    return node
Code Example #3
def create_decision_tree (data, attributes):

    root = DecisionTree()

    # Termination conditions: check() handles the cases where all labels are
    # the same or no attributes remain
    checkint = check(data, attributes)
    if (checkint == 0):
        root.node_label = 0
        print root.node_label
        return root
    elif (checkint == 1):
        root.node_label = 1
        print root.node_label
        return root
    
    # Get attribute with maximum information gain
    infogain = 0.0
    attribute = ''
    for ele in attributes:
        #print ele
        newinfogain = utils.information_gain(data, ele)
        #print newinfogain
        if (newinfogain > infogain):
            infogain = newinfogain
            #print infogain
            attribute = ele

    # Check if none of the attributes divide the data. If so, return root by majority polling
    if (infogain == 0):
        checkint = check(data,attribute)
        root.node_label = checkint
        print root.node_label
        return root

    root.value = attribute
    print root.value

    #Divide dataset into left and right
    data_left, data_right = divide_dataset (data, attribute)
    attributes.remove(attribute)
    #print attributes

    #Recurse
    root.left = create_decision_tree (data_left, attributes)
    root.right = create_decision_tree (data_right, attributes)

    return root
Code Example #4
def find_split(x, y, feature_indices):
    best_gain = 0
    best_feature_index = 0
    best_threshold = 0
    for feature_index in feature_indices:
        values = sorted(set(x[:, feature_index]))
        for j in range(len(values) - 1):
            threshold = (values[j] + values[j + 1]) / 2
            x_true, y_true, x_false, y_false = split(x, y, feature_index,
                                                     threshold)
            gain = information_gain(y, y_true, y_false)
            if gain > best_gain:
                best_gain = gain
                best_feature_index = feature_index
                best_threshold = threshold
    return best_feature_index, best_threshold
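
The `split` and `information_gain` helpers that `find_split` relies on are defined elsewhere; a minimal NumPy sketch consistent with the call sites (an assumed implementation, not the original project's) is:

import numpy as np

def split(x, y, feature_index, threshold):
    # partition rows by whether the feature value falls at or below the threshold
    mask = x[:, feature_index] <= threshold
    return x[mask], y[mask], x[~mask], y[~mask]

def information_gain(y, y_true, y_false):
    # parent entropy minus the size-weighted entropy of the two children
    def entropy(labels):
        if len(labels) == 0:
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return -np.sum(p * np.log2(p))
    n = len(y)
    return entropy(y) - (len(y_true) / n) * entropy(y_true) - (len(y_false) / n) * entropy(y_false)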
Code Example #5
File: __init__.py  Project: omartrinidad/schiffsdiebe
    def __tdidt(self, samples, atts, node):
        """
        """
        labels = self.training[:,-1]
        # first
        best_gain = ('', 0)

        # calculate entropy for continuous valued attributes
        for col, sample in enumerate(self.training.T[:-1]):
            # node attributes[col]
            # calculate information gain for each column
            ig = information_gain(sample, labels)

            if ig[1] > best_gain[1]:
                best_gain = (self.attributes[col], ig[1])

        left_node = Knoten()
        right_node = Knoten()

        # recursive call
        self.__tdidt(samples, atts, left_node)
        self.__tdidt(samples, atts, right_node)
        pass
Code Example #6
    def find_split(self, X, y, feature_indices, weights):

        best_gain = -float('inf')
        best_feature_index = -1
        best_value = [0]

        # for each feature to be considered
        for feature_index in sorted(feature_indices):
            # get rows of instances with known values for the feature
            not_nan_rows = [
                a for a in range(X.shape[0])
                if not utils.isnan(X[:, feature_index][a])
            ]

            Xnotnan = (X[not_nan_rows, :])
            ynotnan = y[not_nan_rows]

            #if there aren't any instances with known values for the feature, go to the next one
            if (Xnotnan.shape[0] == 0):
                continue

            # get all possible values for the feature index
            values = sorted(set(Xnotnan[:, feature_index]))

            # if the values are numeric
            if (utils.isnum(Xnotnan[0, feature_index])):

                # split the data using each value
                for j in range(len(values) - 1):

                    #value = (float(values[j]) + float(values[j+1]))/2 -- original
                    value = values[j]
                    # split data using the feature and the value
                    Xs, ys, d = utils.split_num(Xnotnan, ynotnan,
                                                feature_index, value)
                    # calculate gain considering the rate of missing values.
                    # the bigger the rate, the smaller the gain
                    gain = (len(ynotnan) / len(y)) * utils.information_gain(
                        ynotnan, ys)

                    if gain >= best_gain:
                        # if there's a tie on info gain, decide using gain ratio
                        # if(gain == best_gain and best_feature_index != -1):
                        #     print('tie of gain')
                        #     gr = utils.gain_ratio(ynotnan,ys,y)
                        #     not_nan_rows = [a for a in range(X.shape[0]) if not utils.isnan(X[:,best_feature_index][a])]
                        #     Xss,yss, ds = utils.split(X[not_nan_rows,:],y[not_nan_rows],best_feature_index,best_value)
                        #     # calculate gain ratio of previous best feature to compare
                        #     gr_p = utils.gain_ratio(ynotnan,yss,y)
                        #     # if the current feature's gain ratio is not better than the previous one, then
                        #     # go to the next feature
                        #     if(gr < gr_p):
                        #         continue

                        best_gain = gain
                        best_feature_index = feature_index
                        # C4.5 chooses the largest value in the training set that
                        # does not exceed the midpoint (value); this ensures that all
                        # threshold values appearing in trees actually occur in the data
                        best_value = [values[j]]
            # if the values are categorical
            else:
                # split the data using the values
                Xs, ys, d = utils.split_categ(Xnotnan, ynotnan, feature_index,
                                              values)

                gain = ((len(ynotnan) / len(y)) *
                        utils.information_gain(ynotnan, ys)
                        )  #utils.gain_ratio(ynotnan,ys,y))

                if gain >= best_gain:
                    # if(gain == best_gain and best_feature_index != -1):
                    #     print('tie of gain')
                    #     gr = utils.gain_ratio(ynotnan,ys,y)
                    #     not_nan_rows = [a for a in range(X.shape[0]) if not utils.isnan(X[:,best_feature_index][a])]
                    #     Xss,yss, ds = utils.split(X[not_nan_rows,:],y[not_nan_rows],best_feature_index,best_value)
                    #     gr_p = utils.gain_ratio(ynotnan,yss,y)
                    #     if(gr < gr_p):
                    #         continue

                    best_gain = gain
                    best_feature_index = feature_index
                    best_value = values

        return best_feature_index, best_value
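
Note how this example handles missing values: the gain computed on the rows whose value for the feature is known is scaled by len(ynotnan) / len(y), the fraction of known rows. For example, if 80 of 100 rows have a known value and those 80 rows give a raw information gain of 0.5, the score used for comparison is 0.8 * 0.5 = 0.4, so features with many missing values are penalized.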
Code Example #7
    def fit(self, X, y):
        # Train the decision tree (self.tree) using the sample X and labels y.
        # X should be a 2D numpy array of shape N x M, where N is the number of
        # instances and M is the number of features.
        # y should be a 1D numpy array of length N.
        # max_depth is the maximum depth of the tree, min_gain is the minimum
        # information gain required to split, and the keys "left" and "right"
        # hold the left and right child subtrees.

        X = np.asarray(X).astype(float)
        y = np.asarray(y).astype(int)

        #current feature set is empty
        if (self.tree['current_features'].shape[0] == 0):
            self.tree['label'] = np.argmax(np.bincount(y))
            return

        #All instances are same class
        if (len(set(y)) == 1):
            self.tree['label'] = y[0]
            return

        #Reach max_depth
        if ((self.tree['max_depth'] > 0)
                and (self.tree['depth'] == self.tree['max_depth'])):
            self.tree['label'] = np.argmax(np.bincount(y))
            return

        current_features = self.tree['current_features']
        max_information_gain_list = []
        max_split_val_list = []

        for split_attribute in current_features:
            X_select = list(set([x[split_attribute] for x in X]))
            max_information_gain = 0
            max_split_val = X_select[0]

            for split_val in X_select:
                (_, _, y_left,
                 y_right) = partition_classes(X, y, split_attribute, split_val)
                current_information_gain = information_gain(
                    y, [y_left, y_right])

                if (current_information_gain > max_information_gain):
                    max_information_gain = current_information_gain
                    max_split_val = split_val

            max_information_gain_list.append(max_information_gain)
            max_split_val_list.append(max_split_val)

        #index of split_attribute in current features
        index = np.argmax(max_information_gain_list)

        #information gain is less than threshold
        if (max_information_gain_list[index] <= self.tree['min_gain']):
            self.tree['label'] = np.argmax(np.bincount(y))
            return

        self.tree['split_attribute'] = current_features[index]
        self.tree['split_val'] = max_split_val_list[index]

        #split node
        (X_left, X_right, y_left, y_right) = partition_classes(X, y, \
        self.tree['split_attribute'], self.tree['split_val'])

        left_tree = ID3(X_left, y_left, self.tree['max_depth'],
                        self.tree['min_gain'])
        right_tree = ID3(X_right, y_right, self.tree['max_depth'],
                         self.tree['min_gain'])

        current_features = np.delete(current_features, index)
        left_tree.tree['current_features'] = current_features
        right_tree.tree['current_features'] = current_features

        left_tree.tree['depth'] = self.tree['depth'] + 1
        right_tree.tree['depth'] = self.tree['depth'] + 1

        left_tree.fit(X_left, y_left)
        right_tree.fit(X_right, y_right)

        self.tree['left'] = left_tree
        self.tree['right'] = right_tree

        return
Code Example #8
from utils import information_gain, entropy
from collections import Counter
from math import fabs

eps = 1e-3

X = [
    ['青年', '否', '否', '一般'],
    ['青年', '否', '否', '好'],
    ['青年', '是', '否', '好'],
    ['青年', '是', '是', '一般'],
    ['青年', '否', '否', '一般'],
    ['中年', '否', '否', '一般'],
    ['中年', '否', '否', '好'],
    ['中年', '是', '是', '好'],
    ['中年', '否', '是', '非常好'],
    ['中年', '否', '是', '非常好'],
    ['老年', '否', '是', '非常好'],
    ['老年', '否', '是', '好'],
    ['老年', '是', '否', '好'],
    ['老年', '是', '否', '非常好'],
    ['老年', '否', '否', '一般'],
]
Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']

assert(fabs(entropy(Counter(Y).values()) - .971) < eps)
assert(fabs(information_gain(X, Y, 0) - .083) < eps)
assert(fabs(information_gain(X, Y, 1) - .324) < eps)
assert(fabs(information_gain(X, Y, 2) - .420) < eps)
assert(fabs(information_gain(X, Y, 3) - .363) < eps)
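
The `utils` module exercised by these assertions is not shown. A minimal sketch that reproduces the asserted values, assuming `entropy` takes raw class counts and `information_gain` takes the dataset, the labels, and a column index, is:

from collections import Counter
from math import log2

def entropy(counts):
    # Shannon entropy (in bits) of a distribution given as raw counts
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c > 0)

def information_gain(X, Y, col):
    # entropy of Y minus the entropy of Y after conditioning on column `col`
    before = entropy(Counter(Y).values())
    after = 0.0
    for val in set(x[col] for x in X):
        subset = [y for x, y in zip(X, Y) if x[col] == val]
        after += len(subset) / len(Y) * entropy(Counter(subset).values())
    return before - after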
Code Example #9
from utils import information_gain, entropy
from collections import Counter

eps = 1e-3

X = [
    ['青年', '否', '否', '一般'],
    ['青年', '否', '否', '好'],
    ['青年', '是', '否', '好'],
    ['青年', '是', '是', '一般'],
    ['青年', '否', '否', '一般'],
    ['老年', '否', '否', '一般'],
    ['老年', '否', '否', '好'],
    ['老年', '是', '是', '好'],
    ['老年', '否', '是', '非常好'],
    ['老年', '否', '是', '非常好'],
    ['老年', '否', '是', '非常好'],
    ['老年', '否', '是', '好'],
    ['老年', '是', '否', '好'],
    ['老年', '是', '否', '非常好'],
    ['老年', '否', '否', '一般'],
]
Y = ['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否']

assert (entropy(Counter(Y).values()) - .971 < eps)
assert (information_gain(X, Y, 0) - .083 < eps)
assert (information_gain(X, Y, 1) - .324 < eps)
assert (information_gain(X, Y, 2) - .420 < eps)
assert (information_gain(X, Y, 3) - .363 < eps)
Code Example #10
     ['OVERCAST', 81, 75, 'F', 'PLAY'], ['RAIN', 71, 80, 'T', "DON'T PLAY"],
     ['RAIN', 65, 70, 'T', "DON'T PLAY"], ['RAIN', 75, 80, 'F', 'PLAY'],
     ['RAIN', 68, 80, 'F', 'PLAY'], ['RAIN', 70, 96, 'F', 'PLAY']],
    dtype='object',
    columns=original_attributes)

X = data[data.columns[:-1]].values
y = data['Class'].values
print('Testing entropy, information gain, gain ratio...')
assert (utils.entropy([1, 0, 0, 1, 0, 1]) == 1)
assert (utils.entropy([1, 1, 1]) == 0)
assert (utils.entropy([0]) == 0)
outlook_index = np.where(original_attributes == 'Outlook')[0][0]
Xs, ys, d = utils.split_categ(X, y, outlook_index,
                              list(set(X[:, outlook_index])))
assert (np.isclose(utils.information_gain(y, ys), 0.246, rtol=1e-2))
assert (np.isclose(utils.gain_ratio(y, ys, y), 0.156, rtol=1e-2))

print('Testing gini index...')
assert (utils.gini_impurity([1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0]) == 0.5)
assert (utils.gini_impurity([0, 0, 0, 0, 0]) == 0)
print('Testing gini...')
assert (utils.gini([0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1],
                   [[0, 1, 0, 0], [1, 1, 0, 0, 1, 1, 0, 1]]) == 0.0625)

print('Testing Decision Tree...')
m = dt.DecisionTreeClassifier(missing_branch=False)
m.fit(X, y)
m.to_pdf(original_attributes, out='tree1.pdf')
assert (m.predict(['OVERCAST', 80, 90, 'T']) == 'PLAY')
assert (m.predict(['RAIN', 80, 50, 'F']) == 'PLAY')