def train(self, X):
     train_x = self.cross_validation_split(X, self.n_trees, self.fold_size)
     train_x = self.randomize_features(train_x)
     for fold in train_x:
         dt = DecisionTree(MAX_DEPTH, MIN_NODE)
         dt.train(fold)
         self.trees.append(dt)
Example #2
 def algorithm(self, i):
     # Prepares bootstrap data according to given bootstrap_size
     bootstrap_data_x, bootstrap_data_y = self.get_bootstrap_data()
     tree = DecisionTree(max_depth=self.max_depth,
                         random_subspace=self.n_features)
     tree.fit(bootstrap_data_x, bootstrap_data_y)
     return tree
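
The get_bootstrap_data helper is not shown in this example. A minimal standalone sketch of the same idea (the function name, the bootstrap_size fraction and the NumPy arrays are illustrative assumptions, not part of the original class):

import numpy as np

def get_bootstrap_sample(X, y, bootstrap_size=1.0):
    # Draw row indices with replacement; bootstrap_size is the fraction of rows to draw.
    n = int(len(X) * bootstrap_size)
    idx = np.random.choice(len(X), size=n, replace=True)
    return X[idx], y[idx]

X = np.arange(10).reshape(5, 2)
y = np.array([0, 1, 0, 1, 1])
X_boot, y_boot = get_bootstrap_sample(X, y, bootstrap_size=0.8)
print(X_boot.shape, y_boot.shape)  # (4, 2) (4,)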
Example #3
def train(learning_curve: list[float], output_file: str = 'data/saved_trees/AIBasic.csv') -> None:
    results = []
    d_tree = DecisionTree(move=-1, turn='yellow', subtrees=[])
    random_player = RandomPlayer()
    for t in learning_curve:
        ai = AIPlayerBasic(d_tree, t)
        moves_played, ai_win = run_game(ai, random_player)
        if ai_win:
            moves_played.append(1)
        else:
            moves_played.append(0)

        d_tree.add_game(moves_played)

        results.append(ai_win)

    write_to_file(d_tree, output_file)

    total_win_percent = len([1 for result in results if result])/len(results)

    # Win rate over the most recent games (at most the last 100)
    recent_window = min(len(results), 100)
    recent_wins = sum(1 for result in results[-recent_window:] if result)
    recent_win_percent = recent_wins / recent_window

    print('Recent Win Percentage:', recent_win_percent)
    print('Total Win Percentage:', total_win_percent)
class Ensemble:
    def __init__(self, max_depth, models, stump_class=DecisionStumpErrorRate):
        self.max_depth = max_depth
        self.stump_class = stump_class
        self.models = models
        self.tree_mod = DecisionTree(max_depth, stump_class)

    def _stack_predictions(self, X):
        # One column of predictions per base model: these are the meta-features.
        N = X.shape[0]
        Xnew = np.zeros((N, len(self.models)))
        for cur_col_ind, model in enumerate(self.models):
            Xnew[:, cur_col_ind] = model.predict(X)
        return Xnew

    def fit(self, X, y):
        # Fits a decision tree on the stacked base-model predictions
        # using greedy recursive splitting
        Xnew = self._stack_predictions(X)
        self.tree_mod.fit(Xnew, y, self.max_depth, self.stump_class)

    def predict(self, X):
        # Apply the same stacking transform used during fit before querying the tree.
        Xnew = self._stack_predictions(X)
        return self.tree_mod.pred(Xnew)
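
To make the stacking step concrete, here is a self-contained sketch of how base-model predictions become the meta-features; the ConstantModel stand-ins are purely illustrative and not part of the Ensemble above:

import numpy as np

class ConstantModel:
    # Toy base model: always predicts the same value.
    def __init__(self, value):
        self.value = value

    def predict(self, X):
        return np.full(X.shape[0], self.value)

def stack_predictions(models, X):
    # One column of predictions per base model, as in Ensemble._stack_predictions.
    return np.column_stack([m.predict(X) for m in models])

X = np.zeros((4, 3))
Xnew = stack_predictions([ConstantModel(0), ConstantModel(1)], X)
print(Xnew.shape)  # (4, 2)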
Example #5
def main(argc, argv):
    """ entry point to the program """

    if argc < 3 or argc > 4:
        sys.exit(
            f"Usage python3 {argv[0]} <training_file> <output_dir> <random_features?>"
        )

    _, training_y, training_x = parse_data.read_data(argv[1],
                                                     skip_header=False,
                                                     delimiter=",")

    random_features = None
    if argc >= 4:
        random_features = int(argv[3])

    num_rows = len(training_y)
    while True:
        tree = DecisionTree()

        rows_to_evaluate = random.choices(range(num_rows), k=num_rows)
        tree.train(rows_to_evaluate,
                   training_x,
                   training_y,
                   random_features=random_features)

        filename = f"{argv[2]}/{uuid.uuid4()}.json"
        with open(filename, "w") as out_file:
            out_file.write(tree.to_json())
        print(filename)
def main():
    """
    main function
    """
    # build corpus
    with open("names/female.txt") as textfile:
        females = textfile.readlines()
        females = [name.strip() for name in females]

    with open("names/male.txt") as textfile:
        males = textfile.readlines()
        males = [name.strip() for name in males]

    female_features = [gender_features(name) for name in females]
    male_features = [gender_features(name) for name in males]
    shuffle(female_features)
    shuffle(male_features)

    train_set = [(feature, "female") for feature in female_features[:4500]] + \
        [(feature, "male") for feature in male_features[:2500]]
    test_set = [(feature, "female") for feature in female_features[4500:]] + \
        [(feature, "male") for feature in male_features[2500:]]

    # feed corpus into the tree!
    tree = DecisionTree(train_set)
    print "Trained decision tree (with ID3 heuristic) using {} samples." \
        .format(len(train_set))
    print "Evaluating accuracy with a test set of {} samples..." \
        .format(len(test_set))
    accuracy, nones = tree.evaluate(test_set)
    print "Percent of items classified correctly: {}%" \
        .format(round(accuracy * 100, 2))
    print "Percent of items not classified: {}%" \
        .format(round(nones * 100), 2)
 def test_decision_tree(self):
     decision_tree = DecisionTree()
     decision_tree.train(self.sample_input, self.sample_output, feature_label=self.feature_labels)
     self.assertEqual(decision_tree.root.value, "no surfacing")
     test_input = [1, 1]
     test_output = decision_tree.predict(test_input)
     self.assertEqual(test_output, "yes")
    def train(self, data, labels):
        """
        Trains the Random Forest by creating and training its constituent trees.

        Parameters
        ----------
        data : np.array
            An (n, d) numpy matrix with numerical (float or int) entries. Each row represents a datapoint.
        labels : np.array
            An (n,) numpy array. Each entry represents the label for its corresponding datapoint.
        """
        n, f = data.shape
        num_features = int(self.fraction_features * f)
        num_datapoints = int(self.fraction_data * n)
        k = np.max(labels) + 1
        for i in range(self.num_trees):
            subset_of_data_indices = np.random.choice(n,
                                                      num_datapoints,
                                                      replace=True)
            data_for_tree, labels_for_tree = data[
                subset_of_data_indices], labels[subset_of_data_indices]

            ignore_feature_indices = set(
                np.random.choice(f, f - num_features, replace=False))
            tree = DecisionTree(max_depth=self.max_depth,
                                ignore_feature_indices=ignore_feature_indices,
                                cat_feature_indices=self.cat_feature_indices,
                                max_num_thresholds=self.max_num_thresholds)
            tree.train(data_for_tree, labels_for_tree, None, k)
            self.trees.append(tree)
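
A standalone sketch of the per-tree randomness the docstring describes (a row bootstrap plus a random set of ignored features); the sizes and fractions below are illustrative only:

import numpy as np

n, f = 100, 10                      # datapoints, features
fraction_data, fraction_features = 0.8, 0.5
num_datapoints = int(fraction_data * n)
num_features = int(fraction_features * f)

# Rows are drawn with replacement (bagging) ...
subset_of_data_indices = np.random.choice(n, num_datapoints, replace=True)
# ... and each tree ignores a random subset of the features.
ignore_feature_indices = set(np.random.choice(f, f - num_features, replace=False))
print(len(subset_of_data_indices), sorted(ignore_feature_indices))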
Example #9
    def fit_trees(self, i):
        """
        Applying Bootstrap sampling and balancing different classes and creating a new
        Decision tree classifier and fitting the training data and adds the reference to a dictionary
        :param i:unique id
        :return:
        """
        bag_col_sample = self.training_df.sample(frac=.95, replace=False, random_state=int(time.time()), axis=1)

        balanced_classes = pd.Series([], name='Response', dtype='int32')

        for col_val in self.training_result.unique():

            if self.training_result[self.training_result == col_val].count() >= 1900:
                balanced_classes = balanced_classes.append(self.training_result[self.training_result == col_val].sample(
                    n=1900, random_state=int(time.time()), replace=False, axis=0))
            else:
                balanced_classes = balanced_classes.append(self.training_result[self.training_result == col_val].sample(
                    n=1200, random_state=int(time.time()), replace=True, axis=0))

        bag = bag_col_sample.join(balanced_classes, how='inner')

        clf = DecisionTree(i, bag, self.min_samples_split, self.max_depth)
        clf.fit()
        self.tree_dict[i] = clf
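
A standalone sketch of the per-class resampling idea used above; the class counts and sample sizes are made up, and pd.concat is used in place of the deprecated Series.append:

import pandas as pd

y = pd.Series([0] * 5 + [1] * 50, name='Response')
parts = []
for col_val in y.unique():
    cls = y[y == col_val]
    # Sample with replacement only when a class has fewer rows than requested.
    parts.append(cls.sample(n=20, replace=len(cls) < 20, random_state=0))
balanced = pd.concat(parts)
print(balanced.value_counts())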
Example #10
def test_resultant_tree():
    example_dataset = pd.read_csv('data/benchmark_dataset.tsv', sep='\t')
    decision_tree = DecisionTree(classification_attribute='Joga',
                                 attribute_types={
                                     'Tempo': 'discrete',
                                     'Temperatura': 'discrete',
                                     'Umidade': 'discrete',
                                     'Ventoso': 'discrete'
                                 })
    decision_tree.train(example_dataset)
    expected_tree = json.dumps({
        "('Tempo', 0.247)": {
            "children": [{
                "('Chuvoso', 'Ventoso', 0.971)": {
                    "children": ["('Falso', 'Sim')", "('Verdadeiro', 'Nao')"]
                }
            }, {
                "('Ensolarado', 'Umidade', 0.971)": {
                    "children": ["('Alta', 'Nao')", "('Normal', 'Sim')"]
                }
            }, "('Nublado', 'Sim')"]
        }
    })

    assert decision_tree.to_json() == expected_tree
Example #11
    def run_iteration(self):
        weak_learner = DecisionTree(self.dataset, self.depth)
        weak_learner.build()

        e = 0
        errors = []
        for row in self.dataset:
            if weak_learner.predict(row) != row[-2]:
                e += row[-1]
                errors.append(1)
            else:
                errors.append(0)

        alpha = 0.5 * log((1 - e) / e)
        # print 'e=%.2f a=%.2f'%(e, alpha)

        sum_weights = 0
        for i in range(len(self.dataset)):
            row = self.dataset[i]
            if errors[i] == 1: row[-1] = row[-1] * exp(alpha)
            else: row[-1] = row[-1] * exp(-alpha)
            sum_weights += row[-1]

        for row in self.dataset:
            row[-1] /= sum_weights

        self.weak_learners.append(weak_learner)
        self.alpha.append(alpha)
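
A standalone numeric sketch of the weight update performed in run_iteration; the error e and the misclassification flags below are made up for illustration:

import math

e = 0.3                              # weighted error of the weak learner
alpha = 0.5 * math.log((1 - e) / e)  # learner weight, computed as above

weights = [0.25, 0.25, 0.25, 0.25]
misclassified = [True, False, False, False]

# Up-weight misclassified rows, down-weight the rest, then renormalize.
weights = [w * math.exp(alpha if m else -alpha) for w, m in zip(weights, misclassified)]
total = sum(weights)
weights = [w / total for w in weights]
print(round(alpha, 3), [round(w, 3) for w in weights])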
Example #12
 def test_fit_predict_classification(self):
     decision_tree = DecisionTree()
     data = np.array([[1], [2], [3]])
     labels = np.array([0, 0, 1])
     decision_tree.fit(data, labels)
     pred = decision_tree.predict([3.5])
     self.assertIn(pred, labels)
Example #13
 def test_fit_predict_regression(self):
     decision_tree = DecisionTree(task="regression")
     data = np.array([[1], [2], [3]])
     labels = [0.5, 0.25, 1.5]
     decision_tree.fit(data, np.array(labels))
     pred = decision_tree.predict([2.5])
     self.assertTrue(isinstance(pred, float))
Example #14
 def fit(self,
         X,
         y,
         n_estimators=10,
         max_depth=3,
         min_samples_split=2,
         max_features=None,
         n_samples=None):
     self.trees = []
     self.tree_features = []
     for _ in range(n_estimators):
         m = len(X[0])
         n = len(y)
         if n_samples:
             idx = choices(population=range(n), k=min(n, n_samples))
         else:
             idx = range(n)
         if max_features:
             n_features = min(m, max_features)
         else:
             n_features = int(m**0.5)
         features = sample(range(m), choice(range(1, n_features + 1)))
         X_sub = [[X[i][j] for j in features] for i in idx]
         y_sub = [y[i] for i in idx]
         clf = DecisionTree()
         clf.fit(X_sub, y_sub, max_depth, min_samples_split)
         self.trees.append(clf)
         self.tree_features.append(features)
def run():
    # Column labels.
    # These are used only to print the tree.
    headers = [
        "Class Name", "Left-Weight", "Left-Distance", "Right-Weight",
        "Right-Distance"
    ]

    # The Balance Scale Weight & Distance Database
    # Format: each row is a new data.
    # The first column is the label.
    # The last four columns are features.
    training_data = read_data()

    decision_tree = DecisionTree(headers, training_data)

    balance_scale_tree = decision_tree.get_tree()

    print_tree(balance_scale_tree)

    # Evaluate
    # testing_data = read_data()
    testing_data = [['L', 1, 3, 1, 2], ['B', 2, 3, 3, 2], ['R', 2, 5, 4, 4],
                    ['R', 4, 1, 5, 5], ['L', 5, 3, 1, 1]]

    for row in testing_data:
        print('Actual:', row[0], ', Predicted:',
              print_leaf(classify(row, balance_scale_tree)))

    print('\nTotal accuracy:', print_accuracy(testing_data,
                                              balance_scale_tree))
Example #16
	def __init__(self, X, count, depth, neg, pos, ssf=False, numfeatures=35):
		self.ssf = ssf
		if (self.ssf == False):
			self.trees = [ DecisionTree( get_subsample(X,neg,pos).tolist() , depth) for _ in xrange(count) ]
		else:
			self.feature_subset =  [ np.random.choice([i for i in xrange(len(X[0])-1)], numfeatures).tolist()+[35] for _ in xrange(count) ]
			self.trees = [DecisionTree( np.asarray(get_subsample([[point[i] for i in fs] for point in X],neg,pos)) , depth) for fs in self.feature_subset ]
Example #17
    def fit(self, X, y):
        N = X.shape[0]
        bootstrap_inds = np.random.choice(N, N, replace=True)
        bootstrap_X = X[bootstrap_inds]
        bootstrap_y = y[bootstrap_inds]

        DecisionTree.fit(self, bootstrap_X, bootstrap_y)
Example #18
 def fit(self, X, Y, w=None, w_asymmetric=None, depth=1, T=100, **kwargs):
     self.X = X.copy()
     self.Y = Y.copy()
     N = len(self.Y)
     
     if w is None:
         w = (1.0/float(N))*numpy.ones(N)
     if w_asymmetric is None:
         w_asymmetric = (1.0/float(N))*numpy.ones(N)
     self.weights = w.copy()
     self.weights_asymmetric = numpy.array([i**(1.0/float(T)) 
                                                     for i in w_asymmetric])
     self.weights /= float(sum(self.weights))
     self.weak_classifier_ensemble = []
     self.alpha = []
     
     for t in with_progress(range(T), pbar=self.progressbar):
         # Apply asymmetric weights
         self.weights *= self.weights_asymmetric
         weak_learner = DecisionTree().fit(self.X,self.Y,self.weights, depth=depth)
         Y_pred = weak_learner.predict(self.X)
         e = sum(0.5*self.weights*abs(self.Y-Y_pred))/sum(self.weights)
         if e > 0.5:
             logging.warning(' ending training, no good weak classifiers.')
             break
         ee = (1.0-e)/float(e)
         alpha = 0.5*math.log(ee)
         # increase weights for wrongly classified:
         self.weights *= numpy.exp(-alpha*self.Y*Y_pred)
         self.weights /= sum(self.weights)
         self.weak_classifier_ensemble.append(weak_learner)
         self.alpha.append(alpha)
     return self
Example #19
class TestDecisionTree(unittest.TestCase):

    def setUp(self):
        self.tree = DecisionTree()
        self.data = [{"male": True, "tall": False, "rich": True, "married": False},
                {"male": False, "tall": True, "rich": True, "married": True},
                {"male": False, "tall": True, "rich": False, "married": True},
                {"male": True, "tall": True, "rich": True, "married": True},
                {"male": False, "tall": True, "rich": False, "married": True}]

    def test_entropy_correctness(self):
        entropy = self.tree.entropy(self.data, "married")
        is_within = entropy > 0.721 and entropy < 0.722
        is_within.should.be.ok

    def test_gain_correctness(self):
        gain = self.tree.gain(self.data, "male", "married")
        is_within = gain > 0.321 and gain < 0.322
        is_within.should.be.ok

    def test_find_best_attribute(self):
        attributes = list(self.data[0].keys())
        attributes.remove("married")
        best_attribute = self.tree.find_best_attribute(self.data,
                attributes, "married")
        best_attribute.should.eql("tall")

    def test_grow_decision_tree(self):
        attributes = list(self.data[0].keys())
        attributes.remove("married")
        root_node = self.tree.grow(self.data, attributes, "married")
        root_node.label.should.eql("tall")
 def test_train_method(self):
     decision_tree = DecisionTree()
     decision_tree.train(self.sample_input,
                         self.sample_output,
                         feature_label=self.feature_labels)
     self.assertIsNotNone(decision_tree.root,
                          'Decision tree must have a root node')
Example #21
def training(train_filepath):
    train_data = Instance.read(train_filepath)

    algo = Id3()
    dt = DecisionTree(train_data, algo)
    path = dt.train()
    return path
 def test_choose_feature_to_split(self):
     decision_tree = DecisionTree()
     feature_to_split = decision_tree._select_feature_to_split(
         self.sample_input, self.sample_output)
     self.assertEqual(
         feature_to_split, 0,
         'The best feature index to pick is 0, but get %d' %
         feature_to_split)
 def fit(self, X, y):
     self._classes = np.unique(y)
     # Each tree is given a bootstrap sample of the original data (drawn with replacement) and a randomly selected subset of the features
     bootstrapped_X, bootstrapped_y = self._bootstrap_sample(X, y)
     for i, (i_bootstrapped_X, i_bootstrapped_y) in enumerate(zip(bootstrapped_X, bootstrapped_y)):
         tree = DecisionTree()
         tree.fit(i_bootstrapped_X, i_bootstrapped_y)
         self._forest[i] = tree
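
The _bootstrap_sample helper is not shown here. A minimal sketch of what it might return, assuming one row-resampled copy of (X, y) per tree; the function name and the n_trees argument are assumptions:

import numpy as np

def bootstrap_sample(X, y, n_trees):
    samples_X, samples_y = [], []
    for _ in range(n_trees):
        # Draw as many rows as the original data, with replacement.
        idx = np.random.choice(len(X), size=len(X), replace=True)
        samples_X.append(X[idx])
        samples_y.append(y[idx])
    return samples_X, samples_y

X = np.random.rand(6, 3)
y = np.array([0, 1, 0, 1, 0, 1])
bootstrapped_X, bootstrapped_y = bootstrap_sample(X, y, n_trees=3)
print(len(bootstrapped_X), bootstrapped_X[0].shape)  # 3 (6, 3)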
Example #24
    def fitting(self):
        # TODO: Train `num_trees` decision trees using the bootstraps datasets
        # and labels by calling the learn function from your DecisionTree class.

        for i in range(0, self.num_trees):
            decisiontree = DecisionTree()
            self.decision_trees[i] = decisiontree.learn(
                self.bootstraps_datasets[i], self.bootstraps_labels[i])
Example #25
 def fit(self, X, y):
     rs = ShuffleSplit(n_splits=self.forest_size, train_size=0.75, random_state=42)
     for train_index, _ in rs.split(X):
         X_cur = X[train_index]
         y_cur = y[train_index]
         tree = DecisionTree(self.min_split, self.min_leaf)
         tree.fit(X_cur, y_cur)
         self.forest.append(tree)
 def fit(self, X, y):
     self.trees = []
     for _ in range(self.n_trees):
         tree = DecisionTree(min_samples_split=self.min_samples_split,
             max_depth=self.max_depth, n_feats=self.n_feats)
         X_samp, y_samp = bootstrap_sample(X, y)
         tree.fit(X_samp, y_samp)
         self.trees.append(tree)
    def test_train_with_gini(self):
        algo = Gini()
        dt1 = DecisionTree(self.instances1, algo)
        path1 = dt1.train()
        assert path1 == self.gt1

        dt2 = DecisionTree(self.instances2, algo)
        path2 = dt2.train()
        assert path2 == self.gt2
 def test_info_gain(self):
     expected_info_gain = 0.14
     question = Question(0, 'Green')
     true_rows, false_rows = DecisionTree._partition(
         self.training_data, question)
     current_uncertainty = DecisionTree._gini(self.training_data)
     info_gain = DecisionTree.info_gain(true_rows, false_rows,
                                        current_uncertainty)
     self.assertAlmostEqual(expected_info_gain, info_gain)
 def test_info_gain_with_better_split(self):
     expected_info_gain = 0.3733333
     question = Question(0, 'Red')
     true_rows, false_rows = DecisionTree._partition(
         self.training_data, question)
     current_uncertainty = DecisionTree._gini(self.training_data)
     info_gain = DecisionTree.info_gain(true_rows, false_rows,
                                        current_uncertainty)
     self.assertAlmostEqual(expected_info_gain, info_gain)
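
For reference, a self-contained sketch of the Gini impurity and information-gain quantities these tests exercise; the toy rows below are illustrative and are not the suite's training_data:

from collections import Counter

def gini(rows):
    # Each row carries its class label in the last column.
    counts = Counter(row[-1] for row in rows)
    n = float(len(rows))
    return 1.0 - sum((c / n) ** 2 for c in counts.values())

def info_gain(true_rows, false_rows, current_uncertainty):
    p = len(true_rows) / float(len(true_rows) + len(false_rows))
    return current_uncertainty - p * gini(true_rows) - (1 - p) * gini(false_rows)

rows = [['Green', 'Apple'], ['Red', 'Grape'], ['Yellow', 'Lemon']]
true_rows, false_rows = [rows[0]], rows[1:]
print(round(info_gain(true_rows, false_rows, gini(rows)), 3))  # 0.333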
Example #30
def build_predict_tree(k, max_depth, m, X, y):
    tree = DecisionTree(sample_features=k,
                        max_depth=max_depth,
                        best_attr_method='gini')
    sample = np.random.choice(X.shape[0], m, replace=True)
    tree.fit(X[sample], y[sample])
    #self.trees.append(tree)
    #D.append(tree.predict(X))
    return tree
Example #31
 def train(self, X, y):
     for i in range(self.n_trees):
         X_train, y_train = self.subsample(X, y, self.sample_size)
         X_train = self.drop_features(X_train, self.max_features)
         tree = DecisionTree(max_depth=self.max_depth, split_val_metric=self.split_val_metric,
                             split_node_criterion=self.split_node_criterion, min_info_gain=self.min_info_gain)
         tree.train(X_train, y_train)
         self.trees.append(tree)
     return self
Example #32
def problem_12():
    train_data = get_train_data()
    test_data = get_test_data()
    dt = DecisionTree(train_data, None)
    E_in = get_error_rate(train_data[:, -1],
                          dt.predict_all(train_data[:, :-1]))
    E_out = get_error_rate(test_data[:, -1], dt.predict_all(test_data[:, :-1]))
    print('E in: {}'.format(E_in))
    print('E out: {}'.format(E_out))
 def test_decision_tree(self):
     decision_tree = DecisionTree()
     decision_tree.train(self.sample_input,
                         self.sample_output,
                         feature_label=self.feature_labels)
     self.assertEqual(decision_tree.root.value, 'no surfacing')
     test_input = [1, 1]
     test_output = decision_tree.predict(test_input)
     self.assertEqual(test_output, 'yes')
 def fitting(self):
     # TODO: Train `num_trees` decision trees using the bootstraps datasets
     # and labels by calling the learn function from your DecisionTree class.
     for i in range(self.num_trees):
        # print("training tree #", i)
         a_tree = DecisionTree()
         XX = self.bootstraps_datasets[i]
         y = self.bootstraps_labels[i]
         a_tree.learn(XX,y)
         self.decision_trees[i] = a_tree   
Example #35
 def train(self, train_data, train_labels):
     for i in range(self.num_trees):
         # data bagging
         sample_idx = np.random.randint(0, train_data.shape[0],
                                        self.sample_size)
         t_data, t_labels = train_data[sample_idx], train_labels[sample_idx]
         # attr bagging
         dtree = DecisionTree(max_depth=self.max_depth)
         dtree.train(t_data, t_labels, attr_bagging_size=self.feature_size)
         self.decision_trees.append(dtree)
    def __init__(self):
        self.instances1 = Instance.read('./data/test1.dat')
        dt = DecisionTree(self.instances1, Id3())
        dpath = dt.train()
        dpath.dump('./data/test1.dat.path')
        self.path1 = DecisionTreeResult.load('./data/test1.dat.path')

        self.instances2 = Instance.read('./data/test2.dat')
        dt = DecisionTree(self.instances2, Id3())
        dpath = dt.train()
        dpath.dump('./data/test2.dat.path')
        self.path2 = DecisionTreeResult.load('./data/test2.dat.path')

        self.dtr = DecisionTreeRefiner()
    def train(self, training_examples, train_on_subset=True, num_trees=100, features_considered_per_node=2, **kwds):
        print "Training a decision forest of %d trees, using %d examples, and %d features considered per node." % (
                num_trees, len(training_examples), features_considered_per_node)
        self.trees = []
        total_test_output_stats = SummaryStats()

        binary_classification = all(example["_OUTPUT"] in [0,1] for example in training_examples)
        #binary_classification = True
        #for example in training_examples:
        #    output = example["_OUTPUT"]
        #    if output not in [0,1]:
        #        binary_classification = False
        #        break

        for tree_i in xrange(1, num_trees+1):
            tree = DecisionTree()
            self.trees.append(tree)

            test_set_ids = set(xrange(len(training_examples)))
            for i in xrange(len(training_examples)):
                if train_on_subset:  # N samples with replacement ("bootstrap")
                    index = random.randint(0, len(training_examples)-1)
                else:
                    index = i

                tree.add_example(training_examples[index])
                test_set_ids.discard(index)

            print "Growing tree %d/%d ..." % (tree_i, num_trees),
            tree.grow_tree(features_considered_per_node=features_considered_per_node)

            # Report the in-sample training error
            if binary_classification:
                print "area-under-curve for %d training examples is %2.2f" % (
                        len(tree.examples), tree.test(tree.examples, print_level=0))
            else:
                print "%2.2f avg err^2 on %d training examples" % (
                        tree.avg_squared_error(), len(tree.examples)),


            # Report the out-of-sample testing error, if we have any out-of-sample
            # examples to test on.
            if train_on_subset:
                print "; ",
                test_set = [training_examples[i] for i in test_set_ids]

                if binary_classification:
                    # Do a true out-of-sample test just on this one tree
                    # Temporarily make this a forest-of-one-tree...
                    save_trees = self.trees
                    self.trees = [tree]
                    self.test(test_set)
                    self.trees = save_trees
                else:
                    avg_squared_error = tree.avg_squared_error(test_set)
                    total_test_output_stats.add(avg_squared_error)

                    print "out-of-sample avg err^2 on %d test cases: %.2f [%.2f avg. for all %d trees so far]" % (len(test_set), avg_squared_error, total_test_output_stats.avg(), tree_i),

            print
class SPDRWorker:
    def __init__(self, bins_count=30):
        self.data = list()
        self.node2indexes = defaultdict(list)
        self.decision_tree = DecisionTree()
        self.histograms = dict()
        self.bins_count = bins_count

    def add_object(self, original_class, features):
        index = len(self.data)
        self.data.append((original_class, features))
        node_index = self.decision_tree.navigate(features)
        self.node2indexes[node_index].append(index)
        self.update_histogram_in_tree(node_index, original_class, features)

    def get_histogram(self, node_index, feature_index, class_key):
        if node_index not in self.histograms:
            self.histograms[node_index] = dict()
        if feature_index not in self.histograms[node_index]:
            self.histograms[node_index][feature_index] = dict()
        if class_key not in self.histograms[node_index][feature_index]:
            self.histograms[node_index][feature_index][class_key] = Histogram(self.bins_count)
        return self.histograms[node_index][feature_index][class_key]

    def split_node(self, node_index, feature_index, split_threshold):
        self.clear_histograms(node_index)
        left_child, right_child = self.decision_tree.split_node(node_index, feature_index, split_threshold)
        for index in self.node2indexes[node_index]:
            class_key, features = self.data[index]
            if features[feature_index] < split_threshold:
                self.node2indexes[left_child].append(index)
                self.update_histogram_in_tree(left_child, class_key, features)
            else:
                self.node2indexes[right_child].append(index)
                self.update_histogram_in_tree(right_child, class_key, features)

    def update_histogram_in_tree(self, node_index, original_class, features):
        for feature_index in xrange(len(features)):
            self.get_histogram(node_index, feature_index, original_class).add(features[feature_index])

    def get_classes_from_node_index(self, node_index):
        result = defaultdict(int)
        for index in self.node2indexes[node_index]:
            result[self.data[index][0]] += 1
        return result

    def clear_histograms(self, node_index):
        if node_index in self.histograms:
            del(self.histograms[node_index])
Example #39
 def setUp(self):
     self.tree = DecisionTree()
     self.data = [{"male": True, "tall": False, "rich": True, "married": False},
             {"male": False, "tall": True, "rich": True, "married": True},
             {"male": False, "tall": True, "rich": False, "married": True},
             {"male": True, "tall": True, "rich": True, "married": True},
             {"male": False, "tall": True, "rich": False, "married": True}]
 def test_split_data(self):
     new_sample, new_output, sub_feature_list = DecisionTree._split_data_set(
         self.sample_input, self.sample_output, 0, 1, self.feature_labels
     )
     np.testing.assert_array_equal(new_sample, np.array([[1], [1], [0]]))
     self.assertListEqual(new_output, ["yes", "yes", "no"])
     self.assertListEqual(sub_feature_list, ["flippers"])
Example #41
def main():
    iris = load_iris()
    data_train, data_test, label_train, label_test = train_test_split(iris.data, iris.target)

    dt = DecisionTree()
    dt.fit(data_train, label_train)
    pred = dt.predict(data_test)
    #dt.print_tree()

    #print(iris)
    print(data_train)
    print(label_train)
    print(data_test)
    print(label_test)
    print(pred)
    print(confusion_matrix(label_test, pred))
    def learn(self, features_filepath):
        decision_tree = DecisionTree()
        current_node_index = decision_tree.get_next_non_terminal_node()
        node2indexes = defaultdict(list)
        data = []
        original_G = None

        _logger.debug("Reading data")
        index = 0
        features_count = None
        for current_object in ObjectReader().open(features_filepath):
            original_class = get_class_from_object(current_object)
            features = current_object.features
            if features_count is None:
                features_count = len(features)
            data.append((original_class, features))
            node2indexes[current_node_index].append(index)
            index += 1
        _logger.debug("End reading data")

        while decision_tree.get_next_non_terminal_node() is not None:
            current_node_index = decision_tree.get_next_non_terminal_node()
            data_indexes = node2indexes[current_node_index]
            _logger.debug("Current node index: " + str(current_node_index))
            _logger.debug("Indexes count:" + str(len(data_indexes)))

            current_G = self.impurity_function.calc(get_classes_count(data, data_indexes))
            _logger.debug("Current G: " + str(current_G))
            if original_G is None:
                original_G = current_G
            if (current_G < self.alpha * original_G) or (len(data_indexes) < self.min_object_in_node):
                _logger.debug("Stop in node")
                class_probabilities = dict()
                for index in data_indexes:
                    cls = data[index][0]
                    if cls not in class_probabilities:
                        class_probabilities[cls] = 0
                    class_probabilities[cls] += 1
                decision_tree.set_node_classification(current_node_index, class_probabilities)
                continue

            splits = []
            for feature_index in xrange(features_count):
                values = sorted([data[index][1][feature_index] for index in data_indexes], key=lambda x: float(x))
                for j in xrange(self.discretization):
                    splits.append((feature_index, values[j * len(values) / self.discretization]))

            max_delta = None
            best_feature_index = None
            best_split_threshold = None
            for feature_index, split_threshold in splits:
                left_indexes = []
                right_indexes = []
                for index in data_indexes:
                    if data[index][1][feature_index] < split_threshold:
                        left_indexes.append(index)
                    else:
                        right_indexes.append(index)
                left_G = self.impurity_function.calc(get_classes_count(data, left_indexes))
                right_G = self.impurity_function.calc(get_classes_count(data, right_indexes))
                tau = 1.0 * len(left_indexes) / len(data_indexes)
                delta = current_G - tau * left_G - (1 - tau) * right_G
                if (max_delta is None) or (delta > max_delta):
                    max_delta = delta
                    best_feature_index = feature_index
                    best_split_threshold = split_threshold
            _logger.debug("Best feature index: " + str(best_feature_index))
            _logger.debug("Best split threshold: " + str(best_split_threshold))
            _logger.debug("Max delta: " + str(max_delta))

            left_child, right_child = decision_tree.split_node(current_node_index, best_feature_index,
                                                               best_split_threshold)
            left_indexes = []
            right_indexes = []
            for index in data_indexes:
                if data[index][1][best_feature_index] < best_split_threshold:
                    left_indexes.append(index)
                else:
                    right_indexes.append(index)
            node2indexes[left_child] = left_indexes
            node2indexes[right_child] = right_indexes
        return decision_tree
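
A small standalone sketch of the threshold discretization step in the split search above: rather than testing every observed value, a handful of order-statistic cut points are taken per feature (discretization=4 is illustrative; the indexing is written with integer division so it also runs on Python 3):

import numpy as np

values = np.sort(np.random.rand(20))
discretization = 4
candidate_thresholds = [values[j * len(values) // discretization]
                        for j in range(discretization)]
print(candidate_thresholds)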
 def test_calculate_shannon_entropy(self):
     h = DecisionTree._calculate_shannon_entropy(self.sample_output)
     self.assertAlmostEqual(h, 0.970951, places=5, msg="Shannon entropy should be 0.970951, but get: %f" % h)
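
For reference, a self-contained sketch of the Shannon-entropy computation this test checks; a two-versus-three label split reproduces the 0.970951 value in the assertion (the labels below are illustrative, not necessarily the suite's sample_output):

import math
from collections import Counter

def shannon_entropy(labels):
    counts = Counter(labels)
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in counts.values())

print(round(shannon_entropy(['yes', 'yes', 'no', 'no', 'no']), 6))  # 0.970951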
Example #44
# ..........................
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# Rescale label for Adaboost to {-1, 1}
rescaled_y_train = 2*y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2*y_test - np.ones(np.shape(y_test))

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf = 8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
naive_bayes.fit(X_train, y_train)
print "\tLogistic Regression"
logistic_regression.fit(X_train, y_train)
print "\tMultilayer Perceptron"
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
Example #45
#!coding: utf-8
"""
This is a demo implementation of the ID3 example in Section 5.3 of <<统计学习方法>> (Statistical Learning Methods).
"""
import numpy as np
from decision_tree import DecisionTree


if __name__ == "__main__":
    dat = np.loadtxt("./dat.csv", dtype="i", delimiter=",",
                     usecols=range(1, 5))
    # AGE,WORK,HOUSE,PASS
    dt = DecisionTree(column_names=("age", "work", "house", "pass"))
    tree = dt.training(dat)
    tree_dict = tree.to_dict()["children"]["root"]
    from utils import tree_plot
    from matplotlib import pyplot as plt
    print tree_dict
    tree_plot.show_tree(plt, tree_dict)
    plt.show()
Example #46
def q14():
    tree = DecisionTree()
    tree.fit(*load_train())
    print tree.ein
 def __init__(self, bins_count=30):
     self.data = list()
     self.node2indexes = defaultdict(list)
     self.decision_tree = DecisionTree()
     self.histograms = dict()
     self.bins_count = bins_count
Example #48
def q15():
    tree = DecisionTree()
    tree.fit(*load_train())
    print tree.error(*load_test())
    def learn(self, features_filepath):
        start_time = time.time()
        k1 = 0
        block1_time = 0
        k2 = 0
        block2_time = 0

        features_used = dict()


        decision_tree = DecisionTree()
        workers = [SPDRWorker(self.worker_bins_count) for _ in xrange(self.workers_count)]
        original_G = None
        features_count = None
        classes = set()

        _logger.debug("Reading data")
        index = 0
        for current_object in ObjectReader().open(features_filepath):
            original_class = get_class_from_object(current_object)
            features = current_object.features
            if features_count is None:
                features_count = len(features)
            classes.add(original_class)
            workers[index].add_object(original_class, features)
            index = (index + 1) % self.workers_count
        _logger.debug("End reading data")

        while decision_tree.get_next_non_terminal_node() is not None:
            current_node_index = decision_tree.get_next_non_terminal_node()
            _logger.debug("Current node: " + str(current_node_index))

            histograms = []
            feature_histogram = []
            for feature_index in xrange(features_count):
                histograms.append(dict())
                feature_histogram.append(Histogram(self.worker_bins_count))
                for class_key in classes:
                    if class_key not in histograms[feature_index]:
                        histograms[feature_index][class_key] = Histogram(self.worker_bins_count)
                    for worker in workers:
                        histograms[feature_index][class_key].merge(worker.get_histogram(current_node_index,
                                                                                        feature_index, class_key))
                    feature_histogram[feature_index].merge(histograms[feature_index][class_key])

            classes_in_node_index = defaultdict(int)
            total_elements = 0
            for class_key in classes:
                classes_in_node_index[class_key] = histograms[0][class_key].get_total_elements()
                total_elements += histograms[0][class_key].get_total_elements()
            _logger.debug("Total elements: " + str(total_elements))

            k1 += 1
            cur_time = time.time()
            for worker in workers:
                worker.clear_histograms(current_node_index)
            block1_time += time.time() - cur_time

            decision_tree.set_node_classification(current_node_index, classes_in_node_index)
            if not total_elements:
                raise ValueError("No elements reached node " + str(current_node_index))

            current_G = self.impurity_function.calc(classes_in_node_index)
            current_R = self.regularization(classes_in_node_index)
            _logger.debug("Impurity: " + str(current_G))
            _logger.debug("Regularization: " + str(current_R))
            if original_G is None:
                original_G = current_G
            if (current_G < self.alpha * original_G) or (total_elements < self.min_object_in_node):
                _logger.debug("Stop in node, total elements: " + str(total_elements))
                continue

            splits = []
            for feature_index in xrange(features_count):
                min_value, max_value = feature_histogram[feature_index].get_min_max_elements()
                values = feature_histogram[feature_index].uniform(self.discretization + 1)
                for value in values:
                    if (min_value < value) and (value < max_value):
                        splits.append((feature_index, value))
            max_delta = None
            best_feature_index = None
            best_split_threshold = None
            for feature_index, split_threshold in splits:
                tau = feature_histogram[feature_index].sum(split_threshold) / feature_histogram[feature_index].get_total_elements()
                classes_in_left = dict()
                classes_in_right = dict()
                for class_key in histograms[feature_index]:
                    classes_in_left[class_key] = 0
                    classes_in_right[class_key] = histograms[feature_index][class_key].get_total_elements()
                    if histograms[feature_index][class_key].get_total_elements() > 0:
                        classes_in_left[class_key] = histograms[feature_index][class_key].sum(split_threshold)
                        classes_in_right[class_key] -= classes_in_left[class_key]
                left_R = self.regularization(classes_in_left)
                right_R = self.regularization(classes_in_right)
                delta = current_G - tau * (self.impurity_function.calc(classes_in_left) + left_R) - (1 - tau) * (self.impurity_function.calc(classes_in_right) + right_R)
                if (max_delta is None) or (max_delta < delta):
                    max_delta = delta
                    best_feature_index = feature_index
                    best_split_threshold = split_threshold
            _logger.debug("Best feature index: " + str(best_feature_index))
            _logger.debug("Best split threshold: " + str(best_split_threshold))
            _logger.debug("Max delta: " + str(max_delta))

            if (max_delta is not None) and (max_delta > 0):
                decision_tree.split_node(current_node_index, best_feature_index, best_split_threshold)
                if best_feature_index not in features_used:
                    features_used[best_feature_index] = 0
                features_used[best_feature_index] += 1

                k2 += 1
                cur_time = time.time()
                for worker in workers:
                    worker.split_node(current_node_index, best_feature_index, best_split_threshold)
                block2_time += time.time() - cur_time

        _logger.info(features_used)

        total_time = time.time() - start_time
        _logger.info("Total time: " + str(total_time))
        _logger.info("k1: " + str(k1))
        _logger.info("block1 time: " + str(block1_time))
        _logger.info("k2: " + str(k2))
        _logger.info("block2 time: " + str(block2_time))
        parallel_time = (total_time - block1_time - block2_time) + (block1_time + block2_time) / self.workers_count
        _logger.info("Parallel time: " + str(parallel_time))
        _logger.info("Acceleration: " + str(total_time / parallel_time))
        _logger.info("Efficiency: " + str(total_time / parallel_time / self.workers_count))
        _logger.info("Workers count: " + str(self.workers_count))
        _logger.info("Bins count: " + str(self.worker_bins_count))
        if self.info_filepath is not None:
            print >> self.info_filepath, "\t".join(map(str, [total_time, k1, block1_time, k2, block2_time,
                                                             parallel_time,
                                                             total_time / parallel_time,
                                                             total_time / parallel_time / self.workers_count,
                                                             self.workers_count, self.worker_bins_count]))

        return decision_tree
Example #50
def q13():
    tree = DecisionTree()
    tree.fit(*load_train())
    print tree.__prepr__()
    print tree.node_count
def evaluate_performance():
    '''
    Evaluate the performance of decision trees and logistic regression,
    average over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    all_accuracies = []
    for trial in range(1000):
        # TODO: shuffle for each of the trials.
        # the following code is for reference only.
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # TODO: write your own code to split data (for cross validation)
        # the code here is for your reference.
        Xtrain = X[1:101, :]  # train on first 100 instances
        Xtest = X[101:, :]    # test on the remaining instances
        ytrain = y[1:101, :]
        ytest = y[101:, :]

        # train the decision tree
        classifier = DecisionTree(100)
        classifier.fit(Xtrain, ytrain)

        # output predictions on the remaining data and record the accuracy
        y_pred = classifier.predict(Xtest)
        all_accuracies.append(accuracy_score(ytest, y_pred))
        break  # reference code runs a single trial only

    # compute the mean test accuracy of the decision tree across the recorded trials
    meanDecisionTreeAccuracy = np.mean(all_accuracies)

    # TODO: update these statistics based on the results of your experiment
    stddevDecisionTreeAccuracy = 0
    meanLogisticRegressionAccuracy = 0
    stddevLogisticRegressionAccuracy = 0
    meanRandomForestAccuracy = 0
    stddevRandomForestAccuracy = 0

    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
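
The cross-validation split is left as a TODO above. One possible standalone sketch of producing the 10 folds of shuffled indices the docstring calls for (the function name and seed are illustrative):

import numpy as np

def kfold_indices(n, k=10, seed=None):
    rng = np.random.default_rng(seed)
    return np.array_split(rng.permutation(n), k)

folds = kfold_indices(n=50, k=10, seed=0)
test_idx = folds[0]
train_idx = np.concatenate(folds[1:])
print(len(train_idx), len(test_idx))  # 45 5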
 def test_choose_feature_to_split(self):
     decision_tree = DecisionTree()
     feature_to_split = decision_tree._select_feature_to_split(self.sample_input, self.sample_output)
     self.assertEqual(feature_to_split, 0, "The best feature index to pick is 0, but get %d" % feature_to_split)
 def test_train_method(self):
     decision_tree = DecisionTree()
     decision_tree.train(self.sample_input, self.sample_output, feature_label=self.feature_labels)
     self.assertIsNotNone(decision_tree.root, "Decision tree must have a root node")