Code Example #1
File: testnode.py Project: jinyyu/machine-learning
import unittest

# creatData and Node are defined in the project module under test


class TestNode(unittest.TestCase):
    def setUp(self):
        self.df = creatData()
        self.node = Node(self.df)

    def test_entropy(self):
        h = self.node._entropy(self.df)
        self.assertTrue(0.97 < h < 0.98)  # expected h = 0.971

    def test_conditionalEntropy(self):
        h_age = self.node.conditionalEntropy('age')
        self.assertTrue(0.083 < h_age < 0.084)  # expected h_age = 0.083

        h_work = self.node.conditionalEntropy('work')
        self.assertTrue(0.32 < h_work < 0.33)  # expected h_work = 0.324

        h_house = self.node.conditionalEntropy('house')
        self.assertTrue(0.41 < h_house < 0.42)  # expected h_house = 0.419

        h_credit = self.node.conditionalEntropy('credit')
        self.assertTrue(0.362 < h_credit < 0.364)  # expected h_credit = 0.363

    def test_findBestFeature(self):
        feature = self.node.findBestFeature()
        self.assertEqual(feature, 'house')

    def test_getDadaFrame(self):
        df = self.node.getDadaFrame('age', 1)
        df2 = self.node.getDadaFrame('age', 2)
        # the original test asserts nothing; at least check the splits exist
        self.assertIsNotNone(df)
        self.assertIsNotNone(df2)
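
For reference, the value checked by test_entropy is the Shannon entropy of the label distribution. A minimal pandas-based sketch of such a computation follows; the column name 'class' is an assumption, since the project's creatData() schema is not shown, and this is not the project's actual _entropy implementation:

import numpy as np

def label_entropy(df, label_col='class'):
    # Shannon entropy of the label distribution (illustrative sketch)
    probs = df[label_col].value_counts(normalize=True)
    return float(-(probs * np.log2(probs)).sum())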
Code Example #2
import numpy as np


def get_random_split(data, is_categorical, n_features):
    # Evaluate a random subset of attributes and keep the split with the
    # lowest Gini impurity. The last column of each row is the class label.
    classes = list(set(row[-1] for row in data))
    ind, split, best_gini, best_groups = 0, 0, 1, []
    num_features = len(data[0]) - 1
    sub = np.random.choice(range(num_features), n_features, replace=False)
    for att in sub:
        att_vals = list(set(row[att] for row in data))
        mini = min(att_vals)
        maxi = max(att_vals)
        if mini == maxi:  # constant attribute, nothing to split on
            continue
        if sorted(att_vals) == [0, 1]:  # binary attribute: only two thresholds
            for i in [0, 1]:
                groups = binary_split(att, i, data)
                gini = gini_index(groups, classes)
                if gini < best_gini:
                    ind, split, best_gini, best_groups = att, i, gini, groups
            continue
        # numeric (or categorical) attribute: scan 100 candidate thresholds
        for i in np.arange(mini, maxi, (maxi - mini) / 100):
            if att in is_categorical:
                groups = categorical_split(att)  # helper defined elsewhere in the project
            else:
                groups = binary_split(att, i, data)
            gini = gini_index(groups, classes)
            if gini < best_gini:
                ind, split, best_gini, best_groups = att, i, gini, groups
    return Node(ind, split, best_groups)  # might just need attribute and splitting value
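
The function above leans on binary_split and gini_index helpers that are not shown. A minimal sketch consistent with how they are called here (these are assumptions about the project's helpers, not its actual code):

def binary_split(att, value, data):
    # Partition rows by whether attribute `att` is below the threshold
    # (hypothetical signature inferred from the call sites above)
    left = [row for row in data if row[att] < value]
    right = [row for row in data if row[att] >= value]
    return [left, right]

def gini_index(groups, classes):
    # Size-weighted Gini impurity of a candidate split
    n_total = sum(len(group) for group in groups)
    gini = 0.0
    for group in groups:
        if not group:
            continue
        labels = [row[-1] for row in group]
        score = sum((labels.count(c) / len(group)) ** 2 for c in classes)
        gini += (1.0 - score) * (len(group) / n_total)
    return gini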
Code Example #3
    def test_find_best_split(self):
        tree = DecisionTreeRegressor()
        tree.target_class = 'V1'
        node = Node(data=self.data)
        split = tree.find_best_split(node)
        print(split)
        self.assertTupleEqual(('in', 1, ['5', '2', '3']), split[0])
        self.assertAlmostEqual(32.166666666, split[1])
Code Example #4
    def test_find_next_partition(self):
        tree = DecisionTreeRegressor(max_leaf_nodes=3)
        tree.target_class = 'V1'
        # expected best split: (('in', 1, ['5', '2', '3']), 32.16666666666667)
        with open("data/bank-marketing.arff") as file:
            data = Data(file)
        node = Node(data=data)
        node.is_leaf = True
        tree.root = node
        change, next_node = tree.find_best_node_to_split(node)
        print(change, next_node)
        tree.partition(next_node)
        print(tree.n_leaves)
        tree.root.left_child.data.summary()

        tree.root.right_child.data.summary()
        tree.root.right_child.left_child.data.summary()
        tree.root.right_child.right_child.data.summary()
Code Example #5
    def build_tree(self, data, depth):
        # collect the class labels of all rows
        targets = [row[-1] for row in data]
        # stop when the node is pure or the depth limit is reached
        if len(set(targets)) == 1 or depth >= self.max_depth or len(targets) == 0:
            # note: if targets is empty, max() below raises ValueError, so
            # callers should avoid recursing into empty partitions
            clas = max(set(targets), key=targets.count)  # majority class
            return Node(0, 0, [], clas)
        root = get_random_split(data, [], self.n_features)
        for child_data in root.training_groups:  # data in each child partition
            child_node = self.build_tree(child_data, depth + 1)
            root.add_child(child_node)
        return root
Code Example #6
File: testDataset.py Project: mathiasj33/DataMining
    def test_train_node(self):
        features = [
            {1: 1, 2: 1, 3: 1},
            {1: 1, 2: 1, 3: 2},
            {1: 2, 2: 1, 3: 1},
            {1: 3, 2: 2, 3: 1},
            {1: 3, 2: 3, 3: 1},
            {1: 3, 2: 3, 3: 2},
        ]
        labels = [0, 0, 1, 1, 1, 0]
        ds = Dataset(features, labels)
        root = Node(ds)
        tree = DecisionTree(max_depth=100)
        root.train([1, 2, 3], 1, 100)
        print(tree)
Code Example #7
def test_tree_gen(state, aicolor, depthLim, heuristic):
    def tree_gen(node):
        # expand the game tree depth-first until the depth limit
        if node.depth < node.depthLim:
            node.genNextMoves()
            if len(node.nextMoves) == 0:
                endStateCheck(node, None)
            else:
                for n in node.nextMoves:
                    tree_gen(n)

    tempNode = Node(aicolor, state, 0, depthLim, heuristic)
    tree_gen(tempNode)
    assert True  # smoke test: passes if generation completes without raising
Code Example #8
# Array math
import numpy as np

# Data wrangling
import pandas as pd

# scikit-learn reference implementation
from sklearn.tree import DecisionTreeClassifier, export_text

# Reading the data
d = pd.read_csv("data/classification/train.csv")[['Age', 'Fare',
                                                  'Survived']].dropna()

# Constructing the X and Y matrices
X = d[['Age', 'Fare']].copy()  # copy so prediction columns can be added later
Y = d['Survived'].values.tolist()

# Constructing the parameter dict
hp = {'max_depth': 4, 'min_samples_split': 50}

# Initializing the root node
root = Node(Y, X, **hp)

# Growing the custom tree
root.grow_tree()

# Using the ML package
clf = DecisionTreeClassifier(**hp)
clf.fit(X, Y)

# Printing out the trees
root.print_tree()
print(export_text(clf, feature_names=['Age', 'Fare']))

# Predictions
X['scikit_learn'] = clf.predict(X[['Age', 'Fare']])
X['custom_yhat'] = root.predict(X[['Age', 'Fare']])
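
To quantify how closely the custom tree tracks scikit-learn, the two prediction columns can be compared. This follow-up is illustrative and not part of the original script:

# Fraction of rows where the custom tree agrees with scikit-learn
agreement = (X['scikit_learn'] == X['custom_yhat']).mean()
print(f"Prediction agreement: {agreement:.2%}")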
Code Example #9
    def fit(self,
            features,
            attributes,
            prev_value,
            label_set,
            current_depth,
            max_depth,
            rand_attribute_size=None):
        """train Id3 decision tree

        :param features: ordered features from dataset
        :type features: python list containing Feature objects
        :param attributes: attributes for current fit iteration
        :type attributes: python tuple containing Attribute objects
        :param prev_value: attribute value of previous adjacent node
        :type prev_value: integer or None
        :param label_set: ordered labels from dataset
        :type label_set: python tuple containing possible integer labels
        :param current_depth: current tree depth
        :type current_depth: integer
        :param max_depth: maximum desired tree depth
        :type max_depth: integer or float
        :param rand_attribute_size: size of desired random attribute subset if not None
        :type rand_attribute_size: integer or None
        :return: root node of decision tree
        :rtype: Node.Node
        """
        if current_depth > self.max_height:
            self.max_height = current_depth

        if current_depth == max_depth:
            label = get_most_common_label(features)
            return Node.Node(None, prev_value, label)

        # if every example shares one label, return a leaf with that label
        base_label = features[0].get_label()
        same_label = all(example.get_label() == base_label
                         for example in features)

        if same_label:
            return Node.Node(None, prev_value, base_label)

        if len(attributes) == 0:
            label = get_most_common_label(features)
            return Node.Node(None, prev_value, label)

        if rand_attribute_size is not None:
            # sample a random attribute subset (random-forest-style splitting)
            random_attributes = random.sample(
                attributes, min(rand_attribute_size, len(attributes)))

            attribute_to_split_on = Metrics.get_splitting_attribute(
                features, random_attributes, label_set, self.metric)
        else:
            attribute_to_split_on = Metrics.get_splitting_attribute(
                features, attributes, label_set, self.metric)

        # Make root node
        node = Node.Node(attribute_to_split_on, prev_value, None)

        # Construct S_v
        for attribute_value in attribute_to_split_on.values:
            examples_less_split_attribute = []
            for example in features:
                if example.get_attribute_value(
                        attribute_to_split_on) == attribute_value:
                    examples_less_split_attribute.append(example)

            # If S_v is empty, add leaf node containing most common label of S
            if len(examples_less_split_attribute) == 0:
                node.add_child(
                    Node.Node(None, attribute_value,
                              get_most_common_label(features)))

            else:
                less_attributes = list(copy.deepcopy(attributes))
                less_attributes.remove(attribute_to_split_on)
                node.add_child(
                    self.fit(examples_less_split_attribute, less_attributes,
                             attribute_value, label_set, current_depth + 1,
                             max_depth, rand_attribute_size))

        if prev_value is None:
            # top-level call: also record the root on the tree object
            self.root = node
        return node
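
The method relies on a get_most_common_label helper that is not shown. A minimal sketch consistent with how it is called here, assuming Feature objects expose get_label() (an assumption, not the project's code):

from collections import Counter

def get_most_common_label(features):
    # Majority label among the given examples (hypothetical sketch)
    counts = Counter(example.get_label() for example in features)
    return counts.most_common(1)[0][0]  # most_common(1) -> [(label, count)]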
Code Example #10
File: Tree.py Project: svencortel/TreeML-API
class Tree:
    """The structure of the tree"""
    def __init__(self, criterion="entropy", max_depth=None, lookahead=False,
                 random_feat=False, PFSRT=False, omega=1.9, theta=0.9):
        if random_feat and lookahead:
            raise Exception("random and lookahead cannot coexist in the same tree")
        if PFSRT and lookahead:
            raise Exception("PFSRT and lookahead cannot coexist in the same tree")

        self.criterion = criterion
        self.max_depth = max_depth
        self.lookahead = lookahead
        self.X_data = None
        self.y_data = None
        self.root_node = None
        self.random = random_feat
        self.nr_features = 0
        self._nr_examples = 0

        # PFSRT variables
        self.is_PFSRT = PFSRT
        self._best_accuracy = 0
        self._cur_accuracy = 0
        self.omega = omega # reward
        self.theta = theta # punish

        # Probabilistic Feature Selection Random Tree (PFSRT)
        # DS = Depth Score
        # PS = Prior Score
        self.DS = None
        self.PS = None

    def load(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        self.nr_features = X_data.shape[1]
        self._nr_examples = X_data.shape[0]
        # updated data means resetting PFSRT
        self._best_accuracy = 0
        self._cur_accuracy = 0
        if self.is_PFSRT:
            # note: the score matrices assume a finite max_depth is set
            self.DS = np.ones((self.nr_features, self.max_depth))
            self.PS = np.ones((self.nr_features, self.nr_features + 1))

    def fit(self, X_data, y_data):
        self.train(X_data, y_data)

    def train(self, X_data=None, y_data=None):
        if X_data is None and y_data is None:
            if self.X_data is None:
                raise Exception("No data loaded")
        else:
            self.load(X_data, y_data)

        # init root node and start training
        self.root_node = Node(random_feat=self.random, tree=self)

        if self.lookahead:
            self.root_node.train_lookahead(self.X_data, self.y_data, self.max_depth)
        else:
            self.root_node.train(self.X_data, self.y_data, self.max_depth)

    def updatePFSRT(self):
        if not self.is_PFSRT:
            raise Exception("Must enable PFSRT=True")
        # test over the training data and update DS and PS
        y_pred = self.predict(self.X_data)
        self._cur_accuracy = accuracy_score(y_pred, self.y_data)
        # update DS and PS
        self.root_node.recursiveUpdatePFSRT()
        if self._cur_accuracy > self._best_accuracy:
            self._best_accuracy = self._cur_accuracy

    def predict(self, X_data):
        result = []
        for row in X_data:
            result.append(self.root_node.predictData(row))
        return np.array(result)

    def isBinaryClassifier(self):
        return len(np.unique(self.y_data)) == 2

    def getClassProb(self, X_data):
        if not self.isBinaryClassifier():
            raise Exception("classification must be binary for getClassProb")
        result = []
        for i in X_data:
            result.append(self.root_node.getPositiveProb(i))
        return result

    def printTree(self):
        self.root_node.printNode()
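
A minimal usage sketch of this class, assuming a compatible Node implementation and NumPy arrays (the toy data below is illustrative):

import numpy as np

# Toy dataset: 4 examples, 2 features, binary labels
X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([1, 0, 1, 0])

tree = Tree(criterion="entropy", max_depth=3)
tree.fit(X, y)
print(tree.predict(X))  # predictions for the training examples
tree.printTree()        # textual dump of the learned tree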