# print('\nTesting root low low impurity')
# assert dt.root.low.low.impurity == 0
# print('passed')

# print('\nTesting root low high impurity')
# assert dt.root.low.high.impurity == 0
# print('passed')

print('Splitting Tests')
df = DataFrame.from_array(
    [[1, 11, 'A'], [1, 12, 'A'], [2, 11, 'A'], [1, 13, 'B'], [2, 13, 'B'],
     [3, 13, 'B'], [3, 11, 'B']],
    columns=['x', 'y', 'class'])

dt = DecisionTree(split_metric='gini')
dt.initialize(df)
dt.split()
dt.split()

assert dt.root.high.row_indices == [3, 4, 5]
assert dt.root.low.low.row_indices == [0, 1, 2]
assert dt.root.low.high.row_indices == [6]
print('passed')
dt = DecisionTree(split_metric='gini')
dt.fit(df)
assert dt.root.high.row_indices == [3, 4, 5]
assert dt.root.low.low.row_indices == [0, 1, 2]
assert dt.root.low.high.row_indices == [6]
print('passed')
Example #2
# ..........................
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# Rescaled labels {-1, 1}
rescaled_y_train = 2 * y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2 * y_test - np.ones(np.shape(y_test))
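# e.g. y = [0, 1, 1] maps to [-1, 1, 1]; Adaboost below is trained on these rescaled labels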

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf=8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)
lda = LDA()

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
naive_bayes.fit(X_train, y_train)
print "\tLogistic Regression"
logistic_regression.fit(X_train, y_train)
print "\tLDA"
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument('--decision_tree', action='store_true')
    parser.add_argument('--knn', action='store_true')
    parser.add_argument('--knn_cv', action='store_true')
    parser.add_argument('--decision_tree_cv', action='store_true')
    parser.add_argument('--make_predictions', action='store_true')
    parser.add_argument('--add_index', action='store_true')
    args = parser.parse_args()

    if len(sys.argv) == 1:
        args.make_predictions = True

    x_train = pd.read_csv(os.path.join(DATA_DIR, 'x_train.csv'), header=None)
    x_test = pd.read_csv(os.path.join(DATA_DIR, 'x_test.csv'), header=None)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train.csv'), header=None)
    y_train = y_train.T.squeeze()  # make it a Series

    # decision tree
    if args.decision_tree:
        print('running decision tree, max_depth=5 min_size=5')
        tree = DecisionTree(max_depth=5, min_size=5)
        tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)
        pd.Series(y_pred).to_csv(os.path.join(PREDICTIONS_DIR,
                                              'decision_tree_predictions.csv'),
                                 index=False,
                                 header=False)

    # knn
    if args.knn:
        print('running knn, n_neighbors=5')
        knn = KNN(n_neighbors=5)
        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        pd.Series(y_pred).to_csv(os.path.join(PREDICTIONS_DIR,
                                              'knn_predictions.csv'),
                                 index=False,
                                 header=False)

    if args.knn_cv:
        for n_neighbors in [3, 5, 10, 20, 25]:
            knn = KNN(n_neighbors=n_neighbors)

            kfold = KFold(n_splits=5)
            metrics = []
            for fit_index, val_index in kfold.split(x_train, y_train):
                x_fit, y_fit = x_train.iloc[fit_index], y_train[fit_index]
                x_val, y_val = x_train.iloc[val_index], y_train[val_index]

                print(f'running knn cv, n_neighbors={n_neighbors}')
                knn.fit(x_fit, y_fit)
                y_pred_val = knn.predict(x_val)
                metrics.append(get_metrics(y_val, y_pred_val))

            a, p, r, f = zip(*metrics)
            mean_val_accuracy = np.mean(a)
            mean_val_precision = np.mean(p)
            mean_val_recall = np.mean(r)
            mean_val_f1 = np.mean(f)
            print(f'knn cv metrics for n_neighbors={n_neighbors}:')
            print('avg accuracy:', mean_val_accuracy)
            print('avg precision:', mean_val_precision)
            print('avg recall:', mean_val_recall)
            print('avg f1 score:', mean_val_f1)

    if args.decision_tree_cv:
        for max_depth in [3, 6, 9, 12, 15]:
            tree = DecisionTree(max_depth=max_depth, min_size=5)

            kfold = KFold(n_splits=5)
            metrics = []
            for fit_index, val_index in kfold.split(x_train, y_train):
                x_fit, y_fit = x_train.iloc[fit_index], y_train[fit_index]
                x_val, y_val = x_train.iloc[val_index], y_train[val_index]

                print(
                    f'running decision tree cv, max_depth={max_depth} min_size=5'
                )
                tree.fit(x_fit, y_fit)
                y_pred_val = tree.predict(x_val)
                metrics.append(get_metrics(y_val, y_pred_val))

            a, p, r, f = zip(*metrics)
            mean_val_accuracy = np.mean(a)
            mean_val_precision = np.mean(p)
            mean_val_recall = np.mean(r)
            mean_val_f1 = np.mean(f)
            print(
                f'decision tree cv metrics for max_depth={max_depth} min_size=5:'
            )
            print('avg accuracy:', mean_val_accuracy)
            print('avg precision:', mean_val_precision)
            print('avg recall:', mean_val_recall)
            print('avg f1 score:', mean_val_f1)

    if args.make_predictions:
        '''
        # using k-fold cross validation to select the best hyperparameters for the decision tree model
        param_grid = {
            'max_depth': np.arange(3, 11),
            'min_size': [2, 3, 5, 8, 10]
        }
        final_model = GridSearchCV(tree, param_grid, cv=5, scoring='f1', refit=True, verbose=10)
        final_model.fit(x_train, y_train)
        print(final_model.best_params_)
        '''
        # {'max_depth': 10, 'min_size': 3}

        final_model = DecisionTree(max_depth=10, min_size=3)

        kfold = KFold(n_splits=5)
        metrics = []
        aucs = []
        for fit_index, val_index in kfold.split(x_train, y_train):
            x_fit, y_fit = x_train.iloc[fit_index], y_train[fit_index]
            x_val, y_val = x_train.iloc[val_index], y_train[val_index]

            print(
                'running best model cv, decision tree with max_depth=10 min_size=3'
            )
            final_model.fit(x_fit, y_fit)
            y_pred_val = final_model.predict(x_val)
            metrics.append(get_metrics(y_val, y_pred_val))
            aucs.append(roc_auc_score(y_val, y_pred_val))

        a, p, r, f = zip(*metrics)
        mean_val_accuracy = np.mean(a)
        mean_val_precision = np.mean(p)
        mean_val_recall = np.mean(r)
        mean_val_f1 = np.mean(f)
        mean_val_auc = np.mean(aucs)
        print('best model cv metrics:')
        print('avg accuracy:', mean_val_accuracy)
        print('avg precision:', mean_val_precision)
        print('avg recall:', mean_val_recall)
        print('avg f1 score:', mean_val_f1)
        print('avg auc:', mean_val_auc)

        final_model.fit(x_train, y_train)
        y_pred_train = final_model.predict(x_train)
        accuracy, precision, recall, f1 = get_metrics(y_train, y_pred_train)
        auc = roc_auc_score(y_train, y_pred_train)
        print('best model training set metrics:')
        print('accuracy:', accuracy)
        print('precision:', precision)
        print('recall:', recall)
        print('f1 score:', f1)
        print('auc:', auc)

        y_pred = final_model.predict(x_test)
        pd.Series(y_pred).to_csv(os.path.join(PREDICTIONS_DIR, 'best.csv'),
                                 index=False,
                                 header=False)

    if args.add_index:
        y_pred = pd.read_csv(os.path.join(PREDICTIONS_DIR, 'best.csv'),
                             header=None)
        y_pred.to_csv(os.path.join(PREDICTIONS_DIR, 'best.csv'),
                      header=None)  # this adds the index by default
        print('finished adding index')
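Note: the get_metrics helper used throughout this example is not shown on this
page. Judging from the a, p, r, f = zip(*metrics) unpacking, a minimal sketch
built on sklearn.metrics could look like this (the name and return order are
assumptions):

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(y_true, y_pred):
    # Hypothetical stand-in: returns (accuracy, precision, recall, f1)
    # in the order the cross-validation loops above unpack them.
    return (accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))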
Example #4
def evaluate_performance():
    '''
    Evaluate the performance of decision trees, random forests, and logistic
    regression, averaged over 1,000 trials of 10-fold cross validation

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    all_accuracies = []
    for trial in range(1000):
        # TODO: shuffle for each of the trials.
        # the following code is for reference only.
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # TODO: write your own code to split data (for cross validation)
        # the code here is for your reference.
        Xtrain = X[0:100, :]  # train on first 100 instances
        Xtest = X[100:, :]    # test on remaining instances
        ytrain = y[0:100, :]
        ytest = y[100:, :]

        # train the decision tree
        classifier = DecisionTree(100)
        classifier.fit(Xtrain, ytrain)

        # output predictions on the remaining data
        y_pred = classifier.predict(Xtest)
        accuracy = accuracy_score(ytest, y_pred)
        all_accuracies.append(accuracy)
        break  # reference code runs a single trial; remove this to do all 1,000

    # compute the training accuracy of the model
    meanDecisionTreeAccuracy = np.mean(all_accuracies)

    # TODO: update these statistics based on the results of your experiment
    stddevDecisionTreeAccuracy = 0
    meanLogisticRegressionAccuracy = 0
    stddevLogisticRegressionAccuracy = 0
    meanRandomForestAccuracy = 0
    stddevRandomForestAccuracy = 0

    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
Example #5
def run(retrain=True):

    tree = DecisionTree()
    if not retrain:
        tree = DecisionTree().load('./model/tree.pkl')
    else:
        tree.fit(train_set, train_labels)
    # arr = []
    # Without Early Stopping
    print("-----------------------------")
    print("Without Early Stopping")
    print("-----------------------------")
    print("Height: {} | Terminal Nodes: {}".format(tree.height, tree.leaves))
    print("Train Accuracy: ", tree.accuracy(train_set, train_labels))
    print("Validation Accuracy: ",
          tree.accuracy(validation_set, validation_labels))
    print("Test Accuracy: ", tree.accuracy(test_set, test_labels))
    print("Number of times an attribute is used as the splitting function:")
    print("Feature (Polarity Bin): Frequency")
    for feature in range(feature_count):
        print('%.2f' % bins[feature], ': ', tree.attribute_frequency[feature])
        # arr.append((bins[feature], tree.attribute_frequency[feature]))

    # arr = np.array(arr)
    # np.savetxt("./graphs/freq.csv", arr, delimiter=",")
    print("-----------------------------")
    print("-----------------------------")

    if not retrain:
        # With Early Stopping
        # Height = 40
        tree = DecisionTree().load('./model/early_stopping/tree_height.pkl')
        print("Early Stopping by Max Height = 40")
        print("-----------------------------")
        print("Height: {} | Terminal Nodes: {}".format(tree.height,
                                                       tree.leaves))
        print("Train Accuracy: ", tree.accuracy(train_set, train_labels))
        print("Validation Accuracy: ",
              tree.accuracy(validation_set, validation_labels))
        print("Test Accuracy: ", tree.accuracy(test_set, test_labels))
        print(
            "Number of times an attribute is used as the splitting function:")
        print("Feature (Polarity Bin): Frequency")
        for feature in range(feature_count):
            print('%.2f' % bins[feature], ': ',
                  tree.attribute_frequency[feature])

        print("-----------------------------")
        print("-----------------------------")
        # IG Threshold = 1e-2
        tree = DecisionTree().load('./model/early_stopping/tree_ig.pkl')
        print("Early Stopping by Information Gain Threshold = 0.01")
        print("-----------------------------")
        print("Height: {} | Terminal Nodes: {}".format(tree.height,
                                                       tree.leaves))
        print("Train Accuracy: ", tree.accuracy(train_set, train_labels))
        print("Validation Accuracy: ",
              tree.accuracy(validation_set, validation_labels))
        print("Test Accuracy: ", tree.accuracy(test_set, test_labels))

        print(
            "Number of times an attribute is used as the splitting function:")
        print("Feature (Polarity Bin): Frequency")
        for feature in range(feature_count):
            print('%.2f' % bins[feature], ': ',
                  tree.attribute_frequency[feature])

        print("-----------------------------")
        print("-----------------------------")

    else:

        height_options = [0, 5, 10, 20, 40, 60]

        for height in height_options:
            tree = DecisionTree(max_height=height)
            tree.fit(train_set, train_labels)
            print("Early Stopping by Max Height = {}".format(height))
            print("-----------------------------")
            print("Height: {} | Terminal Nodes: {}".format(
                tree.height, tree.leaves))
            print("Train Accuracy: ", tree.accuracy(train_set, train_labels))
            print("Validation Accuracy: ",
                  tree.accuracy(validation_set, validation_labels))
            print("Test Accuracy: ", tree.accuracy(test_set, test_labels))
            print(
                "Number of times an attribute is used as the splitting function:"
            )
            print("Feature (Polarity Bin): Frequency")
            for feature in range(feature_count):
                print('%.2f' % bins[feature], ': ',
                      tree.attribute_frequency[feature])

            print("-----------------------------")

        ig_options = [0, 1e-4, 1e-3, 1e-2]

        for ig in ig_options:
            tree = DecisionTree(ig_threshold=ig)
            tree.fit(train_set, train_labels)
            print("Early Stopping by Information Gain = {}".format(ig))
            print("-----------------------------")
            print("Height: {} | Terminal Nodes: {}".format(
                tree.height, tree.leaves))
            print("Train Accuracy: ", tree.accuracy(train_set, train_labels))
            print("Validation Accuracy: ",
                  tree.accuracy(validation_set, validation_labels))
            print("Test Accuracy: ", tree.accuracy(test_set, test_labels))
            print(
                "Number of times an attribute is used as the splitting function:"
            )
            print("Feature (Polarity Bin): Frequency")
            for feature in range(feature_count):
                print('%.2f' % bins[feature], ': ',
                      tree.attribute_frequency[feature])

            print("-----------------------------")
Example #6
 def test_entropy(self):
     decision_tree = DecisionTree()
     labels = np.array([1, 1])
     self.assertAlmostEqual(decision_tree._entropy(labels), 0)
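The _entropy method being tested is not shown here; a minimal sketch consistent
with the assertion (a single-class label array has zero entropy), assuming
numpy is imported as np:

 def _entropy(self, labels):
     # Shannon entropy in bits; exactly 0 when all labels are identical.
     _, counts = np.unique(labels, return_counts=True)
     p = counts / counts.sum()
     return -np.sum(p * np.log2(p))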
Example #7
 def test_iterate(self):
     decision_tree = DecisionTree()
     data = np.array([[1], [2], [3]])
     labels = np.array([0, 0, 1])
     node = decision_tree._iterate(data, labels)
     self.assertTrue(isinstance(node, DecisionNode))
Example #8
 def randomize(self):
     for i in range(0, self.size):
         self.genes.append(DecisionTree(6, self.limits))
Example #9
 def mutate(self):
     if random.randint(0, 10) == 0:
         gene = random.randint(0, self.size - 1)
         self.genes[gene] = DecisionTree(5, self.limits)
Example #10
                 "Rating": "category"}

if __name__ == "__main__":
    # Move to data directory
    #cwd = os.getcwd()
    #os.chdir(cwd + "\\data")

    for i in range(1, 5):
        # Read in synth_data
        synth_data = pd.read_csv("data/synthetic-{}.csv".format(i),
                                 names=synth_data_names,
                                 dtype=synth_data_types)
        print("synthetic-{} data".format(i))

        # Train tree (default bin values of 10)
        tree = DecisionTree(10)
        tree.fit(synth_data, limit=3)
        print(RenderTree(tree.root))
        results = tree.predict(synth_data)
        num_correct = sum(row["label"] == synth_data.at[index, "label"]
                            for index, row in results.iterrows())
        accuracy = num_correct / len(synth_data)
        print("accuracy: {}".format(accuracy))

        # Plot decision surface of best decision tree for current data;
        # Following example plots from scikitlearn:
        # https://scikit-learn.org/0.15/auto_examples/tree/plot_iris.html

        # Make subplot
        plt.subplot(2, 2, i, title="synthetic-{} data".format(i))
Example #11
 def __init__(self, num_trees):
     # TODO: do initialization here.
     self.num_trees = num_trees
     self.decision_trees = [DecisionTree() for i in range(num_trees)]
Example #12
        counts_array = np.zeros((len(X), len(self._classes)))  # n_samples x n_classes
        for row_index, count in enumerate(counts):
            for class_index, class_ in enumerate(self._classes):
                counts_array[row_index, class_index] = count[class_]
        proba = counts_array / self._n_trees  # normalize
        return proba

if __name__ == '__main__':

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    dt = DecisionTree()
    dt.fit(X_train, y_train)

    rf = RandomForest()
    rf.fit(X_train, y_train)

    print('DecisionTree: ')

    # dt_predicted_y_train = dt.predict(X_train)
    # print('  predicted_y_train: {}'.format(dt_predicted_y_train))
    # print('  (actual)         : {}'.format(y_train))
    print('  score_train: {}'.format(dt.score(X_train, y_train)))
    # dt_predicted_y_test = dt.predict(X_test)
    # print('  predicted_y_test: {}'.format(dt_predicted_y_test))
    # print('  (actual)        : {}'.format(y_test))
    print('  score_test: {}'.format(dt.score(X_test, y_test)))
Example #13
        f.tight_layout()
        return f


def plot3d(data, classes):
    f = plt.figure()
    ax = f.add_subplot(111, projection='3d')
    ax.scatter([p[0] for p in data],
               [p[1] for p in data],
               [p[2] for p in data],
               c=list(classes))
    return f

if __name__ == '__main__':
    data, classes = load_data()
    f = plot3d(data, classes)
    f = plot2d(data, classes)
    tree = DecisionTree(data, classes, maxDepth=2)

    v1 = np.array([4.1, -0.1, 2.2])
    v2 = np.array([6.1, 0.4, 1.3])

    print("Prediction for {}: {}".format(v1, tree.predict(v1)))
    print("Prediction for {}: {}".format(v2, tree.predict(v2)))

    newdata = list()
    newclasses = list()
    for x1 in np.linspace(0, 10, 10):
        for x2 in np.linspace(-1, 1, 10):
            for x3 in np.linspace(0, 6, 10):
                v = np.array([x1, x2, x3])
                newdata.append(v)
                newclasses.append(tree.predict(v))
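The triple loop enumerates a 10x10x10 grid point by point; the same 1,000
points, in the same order, can be built in one vectorized step (a sketch,
still calling tree.predict one sample at a time as above):

    xs = np.linspace(0, 10, 10)
    ys = np.linspace(-1, 1, 10)
    zs = np.linspace(0, 6, 10)
    grid = np.array(np.meshgrid(xs, ys, zs, indexing='ij')).reshape(3, -1).T
    newdata = list(grid)
    newclasses = [tree.predict(v) for v in grid]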
Example #14
                           action='store',
                           dest='file',
                           help='name of the dataset',
                           required=True)
    argparser.add_argument(
        '-m',
        '-measure',
        action='store',
        dest='measure',
        help='Choose criterion: information_gain or gini. '
             'By default: information_gain',
        default='information_gain')
    argparser.add_argument('-p',
                           '-prune',
                           action='store',
                           dest='prune',
                           help='Set prune. By default: ""',
                           default="")
    results = argparser.parse_args()

    dataset = pd.read_csv(f'datasets/{results.file}')

    attribute_list = list(dataset.columns)[:-1]
    x_data, y_data = np.split(dataset, [-1], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

    decision_tree = DecisionTree(results.measure, results.prune)

    decision_tree.fit(x_train, y_train, attribute_list)

    print(decision_tree.score(x_test, y_test))
Example #15
 def __init__(self, num_trees):
     self.num_trees = num_trees
     self.decision_trees = [DecisionTree() for i in range(num_trees)]
Example #16
def evaluate_performance():
    '''
    evaluate_performance: evaluates the performance of decision trees and
    random forests, averaged over 1,000 trials (logistic regression is
    commented out below)

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n, d = X.shape

    accuracies_tree = []
    accuracies_forest = []
    #accuracies_logistic = []
    for trial in range(1000):
        if trial % 100 == 0:
            print(trial)
        # TODO: shuffle for each of the trials.
        # the following code is for reference only.
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # TODO: write your own code to split data (for cross validation)
        # the code here is for your reference.
        Xtrain = X[0:100, :]  # train on first 100 instances
        Xtest = X[100:, :]
        ytrain = y[0:100, :]  # test on remaining instances
        ytest = y[100:, :]

        train = (np.hstack((Xtrain, ytrain))).tolist()
        # train the decision tree
        classifier = DecisionTree(100)
        tree = classifier.fit(train)

        # output predictions on the remaining data
        y_pred_tree = classifier.predict(Xtest, tree, [])
        accuracy_tree = accuracy_score(ytest, y_pred_tree)
        accuracies_tree.append(accuracy_tree)

        clt = RandomForest(10, 100)
        forest = clt.fit(Xtrain, ytrain)

        y_pred_forest, conf = clt.predict(Xtest, forest)
        accuracy_forest = accuracy_score(ytest, y_pred_forest)
        accuracies_forest.append(accuracy_forest)

    # compute the training accuracy of the model
    meanDecisionTreeAccuracy = np.mean(accuracies_tree)
    stddevDecisionTreeAccuracy = np.std(accuracies_tree)
    #meanLogisticRegressionAccuracy = 0
    #stddevLogisticRegressionAccuracy = 0
    meanRandomForestAccuracy = np.mean(accuracies_forest)
    stddevRandomForestAccuracy = np.std(accuracies_forest)

    # make certain that the return value matches the API specification
    stats = np.zeros((2, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    #stats[2, 0] = meanLogisticRegressionAccuracy
    #stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
Example #17
        # PLOT RESULT
        utils.plotClassifier(model, X, y)

        fname = os.path.join("..", "figs", "q2_2_decisionBoundary.pdf")
        plt.savefig(fname)
        print("\nFigure saved as '%s'" % fname)

    elif question == "2.3":
        # 1. Load citiesSmall dataset
        dataset = load_dataset("citiesSmall.pkl")
        X = dataset["X"]
        y = dataset["y"]

        # 2. Evaluate decision tree
        model = DecisionTree(max_depth=2)
        model.fit(X, y)

        y_pred = model.predict(X)
        error = np.mean(y_pred != y)

        print("Error: %.3f" % error)

    elif question == "2.4":
        dataset = load_dataset("citiesSmall.pkl")
        X = dataset["X"]
        y = dataset["y"]
        print("n = %d" % X.shape[0])

        depths = np.arange(1, 15)  # depths to try
Example #18

def get_data_set():
    data_set = list()
    df = pd.read_csv('xg.csv')
    for i, row in df.iterrows():
        temp_dict = dict()
        temp_dict['id'] = row.iloc[0]
        temp_dict['class'] = row.iloc[-1]
        temp_dict['feature'] = list(row.iloc[1:-1])
        data_set.append(temp_dict)
    return data_set


if __name__ == '__main__':
    d_set = get_data_set()
    train_set = d_set[0: 10]
    test_set = d_set[10:]
    # train_set = d_set
    print(len(train_set))
    print(len(test_set))
    dt = DecisionTree(train_set, [0, 1, 2, 3, 4, 5])
    t = dt.get_tree_dict()
    json.dump(t, open('p.json', 'w'))
    correction = 0
    for data in test_set:
        if data['class'] == dt.predict(data):
            correction += 1
    accuracy = correction / len(test_set)
    print('accuracy', accuracy)
Example #19
 def test_square_loss(self):
     decision_tree = DecisionTree()
     labels = np.array([0, 0])
     self.assertAlmostEqual(decision_tree._square_loss(labels), 0)
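_square_loss is likewise not shown; a sketch that satisfies the assertion
(constant labels give zero loss), assuming it is the mean squared deviation
from the label mean:

 def _square_loss(self, labels):
     # Mean squared deviation from the mean label; 0 when all labels agree.
     return np.mean((labels - labels.mean()) ** 2)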
Example #20
        te_error = np.mean(y_pred != y_test)
        print("Training error: %.3f" % tr_error)
        print("Testing error: %.3f" % te_error)

    elif question == "1.1":
        with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f:
            dataset = pickle.load(f)

        X, y = dataset["X"], dataset["y"]
        X_test, y_test = dataset["Xtest"], dataset["ytest"]

        depths = np.arange(1, 15)  # depths to try

        my_tree_training_errors = np.zeros(depths.size)
        for i, max_depth in enumerate(depths):
            model = DecisionTree(max_depth=max_depth)
            model.fit(X, y)
            y_pred = model.predict(X)
            my_tree_training_errors[i] = np.mean(y_pred != y)
        plt.plot(depths, my_tree_training_errors, label="training error rate")

        my_tree_testing_errors = np.zeros(depths.size)
        for i, max_depth in enumerate(depths):
            model = DecisionTree(max_depth=max_depth)
            model.fit(X, y)  # fit on the training set, evaluate on the test set
            y_pred = model.predict(X_test)
            my_tree_testing_errors[i] = np.mean(y_pred != y_test)
        plt.plot(depths, my_tree_testing_errors, label="testing error rate")

        plt.xlabel("Depth of tree")
        plt.ylabel("Classification error")
Example #21
 def test_gini(self):
     decision_tree = DecisionTree()
     labels = np.array([0, 1])
     self.assertAlmostEqual(decision_tree._gini(labels), 0.5)
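A sketch of a _gini implementation consistent with this test (an even
two-class split has impurity 0.5); the body is an assumption, not the
project's actual code:

 def _gini(self, labels):
     # Gini impurity: 1 - sum over classes of p_k^2; 0.5 for labels [0, 1].
     _, counts = np.unique(labels, return_counts=True)
     p = counts / counts.sum()
     return 1.0 - np.sum(p ** 2)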
Example #22
    correct = 0
    training_df = DataFrame.from_array(training_set,
                                       ['bmi', 'weight', 'class'])
    decision_tree.fit(training_df)
    for test in testing_set:
        test_dict = {'bmi': test[0], 'weight': test[1]}
        if forest:
            prediction = decision_tree.predict(test_dict)
        else:
            prediction = decision_tree.classify(test_dict)
        if prediction == test[2]:
            correct += 1
    return correct, len(testing_set)


dt = DecisionTree('gini')

total_correct = 0
total = 0
for i in range(len(splits[0])):
    print(i + 1, 'testing set')
    results = run_tests(splits[0][i], splits[1][i], dt)
    total += results[1]
    total_correct += results[0]

print(total_correct, total)

forests = [1, 10, 100, 1000]

for num in forests:
    dt = RandomForest(num)
Example #23
 def __init__(self, num_trees, max_depth=-1, min_gain=0):
     # Initialization done here
     self.num_trees = num_trees
     self.decision_trees = [DecisionTree() for i in range(num_trees)]
     self.max_depth = max_depth
     self.min_gain = min_gain
Example #24
new_feature_matrix = np.concatenate((feature_matrix, target.T), axis=1)
# print(len(new_feature_matrix[0]))

# print(len(feature_matrix))
# print(len(feature_matrix[0]))

with open('feature_matrix.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for row in feature_matrix:
        wr.writerow(row)

df = pd.DataFrame(new_feature_matrix)
df = df.iloc[1:]
# print(df.head())

decision_model = DecisionTree(df)
X_train, X_test, y_train, y_test = decision_model.split_dataset()
gini_clf = decision_model.train_gini(X_train, y_train)
entropy_clf = decision_model.train_entropy(X_train, y_train)

print("using gini index:")
gini_pred = decision_model.prediction(X_test, gini_clf)
decision_model.accuracy(y_test, gini_pred)

print("using entropy:")
entropy_pred = decision_model.prediction(X_test, entropy_clf)
decision_model.accuracy(y_test, entropy_pred)

filename = input("Please enter the file name:\n")
path = os.getcwd()
test_data = Preprocess(os.path.join(path, filename))
Example #25
                                       ['bmi', 'weight', 'class'])
    decision_tree.fit(training_df)
    for test in testing_set:
        test_dict = {'bmi': test[0], 'weight': test[1]}
        if forest:
            prediction = decision_tree.predict(test_dict)
        else:
            prediction = decision_tree.classify(test_dict)
        if prediction == test[2]:
            correct += 1
        else:
            indices.append(df.to_array().index(test))
    return correct, len(testing_set)


dt = DecisionTree('gini', max_depth=4)

total_correct = 0
total = 0
# for i in range(len(splits[0])):
#     if i == 0:
# print(len(splits[1][i]))
# print(i+1,'testing set')
print('HALFWAY TRAINING SET')
results = run_tests(splits[0], splits[1], dt)
total += results[1]
total_correct += results[0]

dt_results = {}

dt_results['dt_gini'] = round(total_correct / total, 4)
Example #26
        utils.plotClassifier(model, X, y)

        fname = os.path.join("..", "figs", "q2_1_decisionBoundary.pdf")
        plt.savefig(fname)
        print("\nFigure saved as '%s'" % fname)

    elif question == "2.3":
        # Q2.3 - Decision Tree with depth 2

        # 1. Load citiesSmall dataset
        dataset = utils.load_dataset("citiesSmall")
        X = dataset["X"]
        y = dataset["y"]

        # 2. Evaluate decision tree
        model = DecisionTree(max_depth=2)
        model.fit(X, y)

        y_pred = model.predict(X)
        error = np.mean(y_pred != y)

        print("Error: %.3f" % error)
    
    elif question == "2.4":
        dataset = utils.load_dataset("citiesSmall")
        X = dataset["X"]
        y = dataset["y"]
        print("n = %d" % X.shape[0])

        depths = np.arange(1, 15)  # depths to try
Example #27
# split data into train, test
train, test = train_test_split(data, test_size = 0.2, random_state = 42)

# use libary decision tree to fit data and predict
start = time.time()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train[:,:-1], train[:,-1])
sktree_predictions = clf.predict(test[:,:-1])
end = time.time()
score = accuracy_score(test[:,-1], sktree_predictions)
print(f'Time for sklearn decision tree to finish {datafetcher.name} ({datafetcher.feature} features, {datafetcher.len} observations) fitting and prediction is {end-start:.4f} seconds. Accuracy score is {score}.')
# tree.plot_tree(clf)

# use our decision tree to fit data and predict
# prepare data for my tree

start = time.time()
my_tree = DecisionTree()
my_tree.fit(header[:-1], train)
my_predictions = []
for observation in test:
    predicted = my_tree.predict(my_tree.tree, observation)
    my_predictions.append(predicted)

end = time.time()
my_score = accuracy_score(test[:, -1], my_predictions)
print(f'Time for my decision tree to finish {datafetcher.name} ({datafetcher.feature} features, {datafetcher.len} observations) fitting and prediction is {end-start:.4f} seconds. Accuracy score is {my_score}.')
# my_tree.print_tree()
Example #28
        plt.xlabel("x-coordinate")
        plt.ylabel("y-coordinate")
        plt.legend()

        fname = os.path.join("..", "figs", "q6_3_decisionBoundary.pdf")
        plt.savefig(fname)
        print("\nFigure saved as '%s'" % fname)

    elif question == "6.4":
        with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f:
            dataset = pickle.load(f)

        X = dataset["X"]
        y = dataset["y"]

        model = DecisionTree(max_depth=2, stump_class=DecisionStumpInfoGain)
        model.fit(X, y)

        y_pred = model.predict(X)
        error = np.mean(y_pred != y)

        print("Error: %.3f" % error)

        utils.plotClassifier(model, X, y)

        plt.xlabel("x-coordinate")
        plt.ylabel("y-coordinate")
        plt.legend()

        fname = os.path.join("..", "figs", "q6_4_decisionBoundary.pdf")
        plt.savefig(fname)
Example #29
 def __init__(self, num_trees):
     # Initialization done here
     self.num_trees = num_trees
     self.decision_trees = [DecisionTree() for i in range(num_trees)]
Example #30
def create_tree(verbose=False):
    """
    Parameters
    ----------
    verbose: boolean

    Returns
    -------
    tree: DecisionTree
    """
    # Load the data.
    trips = get_trips()
    arrival_times_df = get_arrival_times(trips)

    # Assume nan means that the train is late.
    arrival_times_df.fillna(value=30, inplace=True)

    # Split the data into training and testing sets.
    training_dates = []
    tuning_dates = []
    testing_dates = []

    last_training_day = datetime.datetime.strptime('2016-04-30', '%Y-%m-%d')
    last_tuning_day = datetime.datetime.strptime('2017-04-30', '%Y-%m-%d')

    for datestr in arrival_times_df.columns:
        this_date = datetime.datetime.strptime(datestr, '%Y-%m-%d')
        if this_date <= last_training_day:
            training_dates.append(datestr)
        elif this_date <= last_tuning_day:
            tuning_dates.append(datestr)
        else:
            testing_dates.append(datestr)

    training_df = arrival_times_df.loc[:, training_dates]
    tuning_df = arrival_times_df.loc[:, tuning_dates]
    testing_df = arrival_times_df.loc[:, testing_dates]

    training_features_df = create_features(list(training_df.columns))
    judge = Judge(training_df)

    # Tune our hyperparameter.
    # Iterate over values for n_min.
    best_tuning_score = 1e10
    best_n_min = 0
    best_tree = None
    for n_min in range(10, 100, 10):

        tree = DecisionTree(err_fn=judge.find_total_absolute_deviation,
                            n_min=n_min)
        tree.train(training_features_df)
        training_score = evaluate(tree, training_df)
        tuning_score = evaluate(tree, tuning_df)

        if tuning_score < best_tuning_score:
            best_tuning_score = tuning_score
            best_n_min = n_min
            best_tree = tree

        if verbose:
            print('n_min', n_min)
            print('training', training_score)
            print('tuning', tuning_score)
            tree.render()

    testing_score = evaluate(best_tree, testing_df)

    if verbose:
        print('best_n_min', best_n_min)
        print('best_tuning', best_tuning_score)
        print('testing score', testing_score)

    return best_tree
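A possible invocation, assuming this module is run as a script:

if __name__ == '__main__':
    tree = create_tree(verbose=True)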