Example #1
def test():

    data_train = pd.read_csv('./data_set/iris_1_3.csv', header=0)
    train_data = np.array(data_train, 'float')

    # Split into features and labels.
    X = train_data[:, :-1]
    y = train_data[:, -1]

    X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=1 / 3., random_state=6)

    d_tree = DecisionTreeClassifier(criterion='gini')

    # Sample with replacement, train the tree on the sampled data, and validate on the out-of-bag data.

    X_subset, y_subset, out_of_bag_data = sampling_bagging(X_train, y_train)

    d_tree.fit(X_subset, y_subset)

    # Use the out-of-bag data to tune the tree.

    print('y_true : ', y_true.tolist())
    pre_lab = d_tree.predict(X_test)
    print('pre_lab: ', pre_lab.tolist())
    # print('test_data\n', np.column_stack((X_test, y_true)))
    print('The accuracy was', 100 * accuracy_score(y_true, pre_lab), '% on the test set')
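Example #1 calls a sampling_bagging helper that is not shown. A minimal sketch of what it might do, assuming it draws a same-size bootstrap sample with replacement and returns the never-drawn rows (stacked with their labels) as out-of-bag data; the name and return layout follow the call site, the rest is an assumption:

import numpy as np

def sampling_bagging(X, y):
    n = len(y)
    idx = np.random.choice(n, size=n, replace=True)  # sample with replacement
    oob_mask = np.ones(n, dtype=bool)
    oob_mask[idx] = False  # rows never drawn are out-of-bag
    out_of_bag_data = np.column_stack((X[oob_mask], y[oob_mask]))
    return X[idx], y[idx], out_of_bag_data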
Example #2
    def test_fit(self):
        clf = DecisionTreeClassifier()
        clf.fit(self.X_train, self.y_train, self.feature_names)
        # verify the decision tree looks like this
        #
        #                        feature:
        #                        outlook
        #                         / | \
        #                       /   |   \
        #             rainy   /  overcast \   sunny
        #                   /       |       \
        #                 /         |         \
        #            feature:     class:     feature:
        #            windy        True       humidity
        #            /   \                    /   \
        #   False  /       \  True     high /       \  normal
        #        /           \            /           \
        #      class:      class:      class:        class:
        #      True        False       False         True

        assert clf.root.feature == 'outlook'
        rainy_node = clf.root.children_by_attribute['rainy']
        overcast_node = clf.root.children_by_attribute['overcast']
        sunny_node = clf.root.children_by_attribute['sunny']
        assert rainy_node.feature == 'windy'
        assert overcast_node.classification == True
        assert sunny_node.feature == 'humidity'
        assert rainy_node.children_by_attribute['False'].classification == True
        assert rainy_node.children_by_attribute['True'].classification == False
        assert sunny_node.children_by_attribute['high'].classification == False
        assert sunny_node.children_by_attribute[
            'normal'].classification == True
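The assertions in test_fit match the classic play-tennis toy dataset. A setUp fixture along the following lines would yield the asserted tree under ID3; the exact values are an assumption for illustration, not necessarily the repository's fixture:

import numpy as np

def setUp(self):
    self.feature_names = ['outlook', 'temperature', 'humidity', 'windy']
    self.X_train = np.array([
        ['sunny', 'hot', 'high', 'False'],
        ['sunny', 'hot', 'high', 'True'],
        ['overcast', 'hot', 'high', 'False'],
        ['rainy', 'mild', 'high', 'False'],
        ['rainy', 'cool', 'normal', 'False'],
        ['rainy', 'cool', 'normal', 'True'],
        ['overcast', 'cool', 'normal', 'True'],
        ['sunny', 'mild', 'high', 'False'],
        ['sunny', 'cool', 'normal', 'False'],
        ['rainy', 'mild', 'normal', 'False'],
        ['sunny', 'mild', 'normal', 'True'],
        ['overcast', 'mild', 'high', 'True'],
        ['overcast', 'hot', 'normal', 'False'],
        ['rainy', 'mild', 'high', 'True'],
    ])
    self.y_train = np.array([False, False, True, True, True, False, True,
                             False, True, True, True, True, True, False])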
Example #3
def main():
    curr_dir = os.path.dirname(__file__)
    csv_file = os.path.join(curr_dir, 'data/play.csv')

    df = pd.read_csv(csv_file, index_col='Dia')
    X, y = df.loc[:, df.columns != 'Jogar'], df['Jogar']

    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    print(clf.rules())
Example #4
    def test_decision_tree_classifier_numerical_split(self):
        model = DecisionTreeClassifier()

        # feature[0] in {3, 1} -> label 1
        # feature[0] in {2, 0} -> label 0
        features = np.array([[3, 0], [3, 0], [3, 0], [2, 0], [2, 0], [2, 0],
                             [1, 0], [1, 0], [1, 0], [0, 0], [0, 0]])
        labels = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0])

        model.fit(features, labels)
        """
    def test_decision_tree_classifier_exhaustive_categorical_split(self):
        model = DecisionTreeClassifier(categorical_feature_indeces=[0, 1])

        # feature[0] in {3, 1} -> label 1
        # feature[0] in {2, 0} -> label 0, except [0, 1] -> label 2
        features = np.array([[3, 0], [3, 0], [3, 0], [2, 0], [2, 0], [2, 0],
                             [1, 0], [1, 0], [1, 0], [0, 0], [0, 1]])
        labels = np.array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 2])

        model.fit(features, labels)
        """
    def __generate_trees(self, df, n_baggings):
        '''
        Given 'n' baggings, generate one Tree model for each of them.
        '''
        for i in range(len(n_baggings)):
            clf = DecisionTreeClassifier(
                self.target_attribute,
                self.n_random_attributes
            )
            clf.fit(df.iloc[n_baggings[i], :])
            self.trees[i] = clf
Example #7
def result():
    payload = request.get_json()
    diagnosis_data = DiagnosisRequest(payload)
    x_test = diagnosis_data.to_np_array()

    tree_model = DecisionTree.query.order_by(DecisionTree.id.desc()).first()
    decision_tree = tree_model.tree
    tree_classifier = DecisionTreeClassifier(initial_tree=decision_tree)

    prediction = tree_classifier.predict(x_test)
    response = {'result': prediction.tolist()[0]}
    return jsonify(response)
Example #8
    def test_predict(self):
        clf = DecisionTreeClassifier()
        clf.fit(self.X_train, self.y_train, self.feature_names)
        expected_for_x = [
            (np.array(['sunny', 'hot', 'high', False]),
             False),  # sunny outlook + high humidity -> don't play
            (np.array(['sunny', 'hot', 'normal', False]),
             True),  # sunny outlook + normal humidity -> play
            (np.array(['overcast', 'hot', 'high', False]),
             True),  # overcast outlook -> play
        ]
        for x, expected in expected_for_x:
            output = clf.predict(x)
            assert output == expected
Example #9
    def fit(self, X, y):
        self.forest = []
        N = len(y)
        N_sub_data = int(N * self.bootstrap)

        for i in range(self.trees_num):
            self.shuffle(X, y)
            X_sub = X[:N_sub_data]
            y_sub = y[:N_sub_data]

            decision_tree = DecisionTreeClassifier(self.features_num,
                                                   self.max_depth)
            decision_tree.fit(X_sub, y_sub)
            # Append the trained decision tree to the forest list
            self.forest.append(decision_tree)
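Example #9 shows only fit; a matching predict would typically take a majority vote over the trees in self.forest. A sketch under that assumption (not the repository's actual method):

import numpy as np
from collections import Counter

def predict(self, X):
    # votes[i][j] = prediction of tree j for sample i; take the most common label per sample.
    votes = np.array([tree.predict(X) for tree in self.forest]).T
    return np.array([Counter(row).most_common(1)[0][0] for row in votes])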
Example #10
    def test_decision_tree_classifier_numerical_split_hard(self):
        model = DecisionTreeClassifier()

        # pairs alternate: even feature[0] -> label 1, odd feature[0] -> label 0
        features = np.array([[0, 0], [0, 0], [1, 0], [1, 0], [2, 0], [2, 0], [3, 0], [3, 0], [4, 0], [4, 0], [5, 0], [5, 0], \
            [6, 0], [6, 0], [7, 0], [7, 0], [8, 0], [8, 0], [9, 0], [9, 0], [10, 0], [10, 0], [11, 0], [11, 0]])
        labels = np.array([1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
                           1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0])

        model.fit(features, labels)
        """
        print("test_decision_tree_classifier_numerical_split_hard")
        from pprint import pprint
        pprint(model._node)
        """

        predictions = model.predict(
            np.array([[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [5, 0], [6, 0],
                      [7, 0], [8, 0], [9, 0], [10, 0], [11, 0]]))

        self.assertEqual(predictions.tolist(),
                         [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])
Example #11
def visualize(model, max_depth=None):
    iris_dataset = datasets.load_iris()
    petal_features = iris_dataset['data'][:, 2:]
    targets = iris_dataset['target']

    if max_depth is None:
        # Do not limit the maximum depth of the decision tree.
        # The iris dataset has few samples and classes, so training stays fast even without a depth limit.
        if model == 'decision_tree':
            clf = DecisionTreeClassifier()
        else:
            clf = RandomForestClassifier()
    else:
        if model == 'decision_tree':
            clf = DecisionTreeClassifier(max_depth=max_depth)
        else:
            clf = RandomForestClassifier(max_depth=max_depth)

    clf.fit(petal_features, targets)

    # Compute the data range, padded by +-1
    x_min = max(0, petal_features[:, 0].min() - 1)
    y_min = max(0, petal_features[:, 1].min() - 1)
    x_max = petal_features[:, 0].max() + 1
    y_max = petal_features[:, 1].max() + 1

    # Build a mesh grid over the range of the training data
    grid_interval = 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_interval),
                         np.arange(y_min, y_max, grid_interval))

    # Classify every mesh point with the trained model
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Plot the classification result for each point
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Plot the data points as well
    for c in np.unique(targets):
        plt.scatter(petal_features[targets == c, 0],
                    petal_features[targets == c, 1])

    feature_names = iris_dataset['feature_names']
    plt.xlabel(feature_names[2])
    plt.ylabel(feature_names[3])
    if max_depth is None:
        plt.title('Max Depth : No Limitation')
        plt.savefig('figures/iris/{}_no_limit.png'.format(model))
    else:
        plt.title('Max Depth : ' + str(max_depth))
        plt.savefig('figures/iris/{}_depth_{}.png'.format(model, max_depth))

    plt.close()
Example #12
    def _compare_sklearn_dataset(self, dataset=None):
        # Fall back to iris when no dataset is supplied (the original shadowed the argument).
        dataset = dataset if dataset is not None else load_iris()

        features = dataset.data
        labels = dataset.target

        model_sklearn = DecisionTreeClassifierSklearn()

        model_sklearn.fit(features, labels)
        predictions_sklearn = model_sklearn.predict(features)

        model = DecisionTreeClassifier()

        model.fit(features, labels)
        predictions = model.predict(features)

        self.assertEqual(predictions.tolist(), predictions_sklearn.tolist())
Example #13
    def test_decision_tree_classifier_fit(self):
        model = DecisionTreeClassifier()

        # XOR problem
        features = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 1], [1, 0],
                             [1, 0], [1, 0], [1, 1], [1, 1], [1, 1]])
        labels = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0])

        model.fit(features, labels)
        """
        from pprint import pprint
        pprint(model._node)
        """

        predictions = model.predict(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))

        self.assertEqual(predictions.tolist(), [0, 1, 1, 0])
Example #14
def compare_performance(trees_num, max_depth, bootstrap):

    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3,  random_state=0)

    # Decision tree
    print('##### Decision tree performance #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    dt_lr_start = time.time()  # record the training start time
    decision_tree.fit(X_train, y_train)
    dt_lr_time = time.time() - dt_lr_start  # training time
    dt_est_start = time.time()  # record the inference start time
    y_est = decision_tree.predict(X_test)
    dt_est_time = time.time() - dt_est_start  # inference time
    print('training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(dt_lr_time, dt_est_time))
    dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
    dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

    # Random forest
    print('##### Random forest performance #####')
    random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    rf_lr_start = time.time()  # record the training start time
    random_forest.fit(X_train, y_train)
    rf_lr_time = time.time() - rf_lr_start  # training time
    rf_est_start = time.time()  # record the inference start time
    y_est = random_forest.predict(X_test)
    rf_est_time = time.time() - rf_est_start  # inference time
    print('training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(rf_lr_time, rf_est_time))
    rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
    rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))
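The classifiers in Example #14 expose an accuracy_score(X, y) method, unlike scikit-learn's function-style metric. A minimal method sketch consistent with how it is called here, assuming a working predict (the actual implementation may differ):

import numpy as np

def accuracy_score(self, X, y):
    # Fraction of samples whose prediction matches the true label.
    return float(np.mean(self.predict(X) == y))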
Example #15
def main():
    args = get_cmd_ln_arguments()

    num_columns, num_rows, training_data = parse_txt_file(args.filename)
    feature_names = get_feature_names(num_columns)

    decision_tree = DecisionTreeClassifier()
    print('Training a decision tree classifier...')
    decision_tree.fit(feature_names, training_data)
    print('Decision tree classifier trained:')
    decision_tree.print()

    print()
    print(
        'Entering a loop to query the decision tree. Press ctrl-c at any time to exit.'
    )

    while True:
        sample = input(
            'Enter a sample ({} numbers separated by a space): '.format(
                num_columns))
        try:
            sample = line_to_int_list(sample)
        except ValueError:
            print(
                'Input was not {} numbers separated by a space. Please try again. '
                .format(num_columns))
            continue
        prediction = decision_tree.predict(sample)
        print('Prediction: {}'.format(prediction))
Example #16
def createTree(data, maximumDepth, currentDepth, tree, m):
    print("At depth: {}".format(currentDepth))
    # starttime = time.time()

    if currentDepth == maximumDepth:
        # If you have reached maximum depth, store the prediction based on number of positive and negative examples
        label = calcLabel(data[:, 0])
        tree.insert(None, None, True, label)
        return

    u_root = giniIndex(data[:, 0])

    gain = 0
    threshold = 0
    best_feature = 0
    i = 0
    for featureIndex in sample(range(1, data.shape[1]), m):
        print("Calculating Gain for Feature Number:{}".format(i))
        currentGain, currentThreshold = getInfoGain(
            data[:, 0], data[:, featureIndex], u_root)
        if currentGain > gain:
            gain = currentGain
            threshold = currentThreshold
            best_feature = featureIndex
        i = i + 1

    if gain == 0:
        label = calcLabel(data[:, 0])
        tree.insert(None, None, True, label)
        return

    trueExamples = data[data[:, best_feature] >= threshold]
    falseExamples = data[data[:, best_feature] < threshold]

    label = calcLabel(data[:, 0])
    tree.insert(best_feature, threshold, False, label)
    tree.left = DecisionTreeClassifier()
    tree.right = DecisionTreeClassifier()
    currentDepth = currentDepth + 1
    # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Time for depth: {} = {}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~".format(currentDepth, time.time() - starttime))
    createTree(trueExamples, maximumDepth, currentDepth, tree.left, m)
    createTree(falseExamples, maximumDepth, currentDepth, tree.right, m)
Example #17
def createTree_adaboost(data, maximumDepth, currentDepth, tree):
    print("At depth: {}".format(currentDepth))
    # starttime = time.time()

    if currentDepth == maximumDepth:
        label = calcLabel(data[:, 0:2])
        tree.insert(None, None, True, label)
        return

    u_root = giniIndex(data[:, 0:2])

    gain = 0
    threshold = 0
    best_feature = 0
    for featureIndex in range(2, data.shape[1]):
        print("Calculating Gain for Feature Number: {}".format(featureIndex -
                                                               2))
        currentGain, currentThreshold = getInfoGain(data[:, 0:2],
                                                    data[:,
                                                         featureIndex], u_root)
        if currentGain > gain:
            gain = currentGain
            threshold = currentThreshold
            best_feature = featureIndex

    if gain == 0:
        label = calcLabel(data[:, 0:2])
        tree.insert(None, None, True, label)
        return

    trueExamples = data[data[:, best_feature] >= threshold]
    falseExamples = data[data[:, best_feature] < threshold]

    label = calcLabel(data[:, 0:2])
    tree.insert(best_feature, threshold, False, label)
    tree.left = DecisionTreeClassifier()
    tree.right = DecisionTreeClassifier()
    currentDepth = currentDepth + 1
    # print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Time for depth: {} = {}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~".format(currentDepth, time.time() - starttime))
    createTree_adaboost(trueExamples, maximumDepth, currentDepth, tree.left)
    createTree_adaboost(falseExamples, maximumDepth, currentDepth, tree.right)
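Both createTree variants rely on giniIndex and getInfoGain, which are not shown. A minimal unweighted sketch consistent with how createTree calls them (labels in column 0, candidate thresholds drawn from the feature values; the AdaBoost variant would additionally need to account for the weights in column 1):

import numpy as np

def giniIndex(labels):
    # Gini impurity: 1 minus the sum of squared class probabilities.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def getInfoGain(labels, feature, u_root):
    # Try each observed feature value as a '>= threshold' split and keep the best gain.
    best_gain, best_threshold = 0.0, 0.0
    for t in np.unique(feature):
        left, right = labels[feature < t], labels[feature >= t]
        if len(left) == 0 or len(right) == 0:
            continue
        u_split = (len(left) * giniIndex(left) + len(right) * giniIndex(right)) / len(labels)
        if u_root - u_split > best_gain:
            best_gain, best_threshold = u_root - u_split, t
    return best_gain, best_threshold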
Example #18
    def fit(self, X, y):

        # print('===', sys._getframe().f_code.co_filename, sys._getframe().f_code.co_name, sys._getframe().f_lineno,"===")
        self.forest = []
        self.n_calss_num = len(set(y))  # number of classes
        self.n_calss = list(set(y))  # list of class labels

        for i in range(self.n_estimators):

            # Randomly sample the data; self.bootstrap is the fraction of the sample set to draw
            X_subset, y_subset = sampling_with_reset(X, y, self.bootstrap)
            ###########################################
            tree = DecisionTreeClassifier(self.max_features, self.criterion,
                                          self.max_depth,
                                          self.min_samples_split,
                                          self.min_impurity_split)
            # Print the tree's name
            print('tree_' + str(i))
            tree.fit(X_subset, y_subset)

            self.forest.append(tree)  # the collection of trees
Example #19
def test_benchmark_numerical():
    print('\n** Numerical benchmark **')
    df = pd.read_csv(DATA_PATH + 'dados_benchmark_v2.csv', sep=';')
    dt = DecisionTreeClassifier(
        target_attribute='Joga',
        n_random_attributes=4
    )
    dt.fit(df)
    dt.print_tree()
Example #20
def compare_depth():

    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.3,  random_state=0)

    # Find the best hyperparameters for the random forest
    trees_num, bootstrap = grid_search_RF()

    # Vary the depth limit of the decision trees
    depth_list = list(range(21))  # examine depth limits from 0 to 20

    dt_train_acc_list = []
    dt_test_acc_list = []
    rf_train_acc_list = []
    rf_test_acc_list = []

    for depth in tqdm(depth_list):
        print('***** max_depth = {} *****'.format(depth))
        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        decision_tree.fit(X_train, y_train)
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        # Append the accuracies to the lists
        dt_train_acc_list.append(dt_train_accuracy)
        dt_test_acc_list.append(dt_test_accuracy)
        print('Decision tree       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

        # Random forest
        random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=depth, bootstrap=bootstrap)
        random_forest.fit(X_train, y_train)
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        # Append the accuracies to the lists
        rf_train_acc_list.append(rf_train_accuracy)
        rf_test_acc_list.append(rf_test_accuracy)
        print('Random forest       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

    # Draw the graph
    plt.plot(depth_list, dt_train_acc_list, label='Decision Tree - train accuracy', color='r')
    plt.plot(depth_list, dt_test_acc_list, label='Decision Tree - test accuracy', color='g')
    plt.plot(depth_list, rf_train_acc_list, label='Random Forest - train accuracy', color='y')
    plt.plot(depth_list, rf_test_acc_list, label='Random Forest - test accuracy', color='b')

    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.xlim(0, 20)
    plt.xticks(np.arange(0, 21, 2))
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.title('Max Depth of Decision Trees and Accuracy')
    # Save the figure
    plt.savefig('figures/mnist/max_depth_&_accuracy.png')
Example #21
def main():

    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(dataset['data'],
                                                        dataset['target'],
                                                        test_size=0.3,
                                                        random_state=0)

    # Examine depth limits of 1 to 3 and the unlimited case
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))
        # Measure accuracy, training time, inference time, and generalization using all features
        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        dt_lr_start = time.time()  # record the training start time
        decision_tree.fit(X_train, y_train)
        dt_lr_time = time.time() - dt_lr_start  # training time
        dt_est_start = time.time()  # record the inference start time
        y_est = decision_tree.predict(X_test)
        dt_est_time = time.time() - dt_est_start  # inference time
        print('Decision tree       training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(
            dt_lr_time, dt_est_time))
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        print('Decision tree       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(
            dt_train_accuracy, dt_test_accuracy))

        # Random forest
        random_forest = RandomForestClassifier(max_depth=depth)
        rf_lr_start = time.time()  # record the training start time
        random_forest.fit(X_train, y_train)
        rf_lr_time = time.time() - rf_lr_start  # training time
        rf_est_start = time.time()  # record the inference start time
        y_est = random_forest.predict(X_test)
        rf_est_time = time.time() - rf_est_start  # inference time
        print('Random forest       training time : {:.6f} [sec]     inference time : {:.6f} [sec]'.format(
            rf_lr_time, rf_est_time))
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        print('Random forest       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(
            rf_train_accuracy, rf_test_accuracy))

        # Restrict to two features (petal length, petal width) and visualize in 2D
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
Example #22
def createForest(m, n, d, data):
	# m: Number of features for a tree; n: Number of trees; d: Depth of each tree
	print("Number of features, m = {}".format(m))
	print("Number of trees, n = {}".format(n))
	print("Maximum Depth, d = {}".format(d))
	length = len(data)
	list_of_trees = []
	
	for i in range(n):
		print("Tree {}".format(i))
		list_of_trees.append(DecisionTreeClassifier())
		sample = np.random.choice(length, length, replace = True)
		sample = data[sample]
		# starttime = time.time()
		createTree(sample, d, 0, list_of_trees[i], m)
		# print("Total Training time {}".format(time.time() - starttime))
		

	return list_of_trees
Example #23
def adaboost(data, l, maximumDepth):
    size = len(data)
    D = np.empty(size)  # D is the distribution matrix (the per-example weights)
    D.fill(1.0 / size)

    data = np.insert(data, 1, D, axis=1)  # insert D as the column at index 1

    tree_list = []
    alpha_list = []

    for weakLearner in range(l):
        tree = DecisionTreeClassifier()
        print("Learner No: {}".format(weakLearner))
        createTree_adaboost(data, maximumDepth, 0, tree)
        err, weightChange_list = errorCalc(tree, data, maximumDepth)
        alpha = (np.log(((1 - err) * 1.0) / err)) / 2
        data[:, 1] = data[:, 1] * np.exp(alpha * np.array(weightChange_list))
        tree_list.append(tree)
        alpha_list.append(alpha)

    return tree_list, alpha_list
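The alpha computed above is the standard AdaBoost learner weight, alpha = ln((1 - err) / err) / 2, and the update multiplies each example's weight by exp(+alpha) or exp(-alpha) according to whether the weak learner misclassified it. Assuming labels in {-1, +1}, the ensemble would predict with the sign of the alpha-weighted vote. A sketch only: errorCalc and the tree's node attributes are not shown, so the traversal below is hypothetical.

import numpy as np

def predict_one(tree, x, depth_limit, depth=0):
    # Hypothetical traversal of the tree built by createTree_adaboost; the node
    # attribute names (leaf, label, feature, threshold) are assumptions.
    if tree.leaf or depth == depth_limit:
        return tree.label
    if x[tree.feature] >= tree.threshold:
        return predict_one(tree.left, x, depth_limit, depth + 1)
    return predict_one(tree.right, x, depth_limit, depth + 1)

def adaboost_predict(tree_list, alpha_list, X, maximumDepth):
    # Sign of the alpha-weighted vote across all weak learners.
    votes = np.array([[predict_one(t, x, maximumDepth) for x in X] for t in tree_list])
    return np.sign(np.dot(np.array(alpha_list), votes))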
Example #24
    def test_calculate_entropy(self):
        clf = DecisionTreeClassifier()
        all_positive_class = np.array([True, True])
        assert clf._calculate_entropy(all_positive_class) == 0.0
        fifty_fifty_mix = np.array([True, False])
        assert clf._calculate_entropy(fifty_fifty_mix) == 1.0
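A _calculate_entropy consistent with the assertions in Example #24 (Shannon entropy, H = -sum p * log2 p over the class proportions); the repository's implementation may differ:

import numpy as np

def _calculate_entropy(self, y):
    # Entropy of the label distribution: 0.0 for a pure node, 1.0 for a 50/50 binary mix.
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))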
Example #25
    def _get_base_estimator(self, **kwargs):
        return DecisionTreeClassifier(**kwargs)
Example #26
from sklearn import datasets, cross_validation

from decision_tree import DecisionTreeClassifier


iris = datasets.load_iris()
X, Y = iris.data, iris.target
clf = DecisionTreeClassifier()
clf.fit(X, Y)
print(cross_validation.cross_val_score(clf, X, Y))
clf.draw_tree('decision_tree_example.png')
Example #27
        {'aspectOfHand': 'palmar'})

    print('Getting unlabelled image features from Phase 1')
    unlabelled_features = helper_functions.get_main_features(
        label_feature_name, unlabelled_dataset_path)

    dorsal_features = {}
    palmar_features = {}

    for image in dorsal_images_list:
        dorsal_features[image] = label_folder_features[image]
    for image in palmar_images_list:
        palmar_features[image] = label_folder_features[image]

    if classifier == 'DT':
        decisiontree = DecisionTreeClassifier(max_depth=100)
        dorsal_images = list(dorsal_features.keys())
        palmar_images = list(palmar_features.keys())
        image_list = dorsal_images
        image_list.extend(palmar_images)
        random.shuffle(image_list)
        X = []
        y = [0] * len(image_list)

        for i in range(0, len(image_list)):
            image = image_list[i]
            if image in dorsal_features:
                y[i] = 0
            else:
                y[i] = 1
            X.append(label_folder_features[image])
Example #28
from treeFunc import createTree, treeAccuracy
from decision_tree import DecisionTreeClassifier
from random import sample, randint
from randomForest import createForest, forestAccuracy
from adaboost import adaboost, treeAccuracy_ada

if __name__ == '__main__':
	trainData = prep.fileRead('pa3_train_reduced.csv')	# Read Training Examples
	trainData = prep.changeData(trainData)
	validData = prep.fileRead('pa3_valid_reduced.csv')	# Read Validation Data
	validData = prep.changeData(validData)
	
	##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~PART-1: DECISION TREE CLASSIFIER~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
	print("!!!!!Executing DECISION TREE CLASSIFIER!!!!!")
	maximum_Depth = 20
	tree = DecisionTreeClassifier()
	# starttime = time.time()
	createTree(trainData, maximum_Depth, 0, tree, 100)
	# print("Total Training time {}".format(time.time() - starttime))

	train_acc_list = []
	valid_acc_list = []
	itr_list = []
	for i in range(21):
		itr_list.append(i)
		train_acc_list.append(treeAccuracy(tree, trainData, i))
		valid_acc_list.append(treeAccuracy(tree, validData, i))

	# plt.scatter(itr_list, train_acc_list, color = 'blue', s = 15)
	# blue_line, = plt.plot(itr_list, train_acc_list, color = 'blue', label = 'Training Accuracy')
	# plt.title("ACCURACY vs DEPTH")
Example #29
from sklearn.datasets import load_iris
from decision_tree import DecisionTreeClassifier
from sklearn import tree

# load the iris dataset
dataset = load_iris()

# set X and y variables
X, y = dataset.data, dataset.target
print(':::::::::::::::::::::::::::::::::::::::::::::')
print(f'APPROPRIATE X, y DATATYPES: {type(X)}')
print(':::::::::::::::::::::::::::::::::::::::::::::')
# create a new instance of the DecisionTreeClassifier object
clf = DecisionTreeClassifier(max_depth=5)

# call the fit method on that object
clf.fit(X, y)
print('')
print(':::::::::::::PREDICTIONS:::::::::::::::::::::')
print('')
print(':::::::::::::::::::::::::::::::::::::::::::::')
inputs = [[1, 1.5, 5, 1.5]]
print(f'INPUTS: {inputs}')
print(f'OUR MODEL PREDICTION: {clf.predict(inputs)}')

clf2 = tree.DecisionTreeClassifier(max_depth=5)
clf2.fit(X, y)

print(f'SCIKITLEARN MODEL PREDICTION: {clf2.predict(inputs)}')
print(':::::::::::::::::::::::::::::::::::::::::::::')