def ft_random_forest_testing(x_train, y_train, x_test, y_test):
    print('Random Forest Feature Loop\n\n')
    train_list = []
    test_list = []
    F1_list = []

    for i in [1, 2, 5, 8, 10, 20, 25, 35, 50]:
        rclf = RandomForestClassifier(max_depth=7, max_features=i, n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        preds = rclf.predict(x_test)
        print('F1 Test {}'.format(f1(y_test, preds)))

        # Grab the useful number per cycle
        train_list.append(train_accuracy)
        test_list.append(test_accuracy)
        F1_list.append(f1(y_test, preds))

    plt.rcParams['font.family'] = ['serif']
    x = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    ax = plt.subplot(111)
    ax.plot(x, train_list, label='training')
    ax.plot(x, test_list, label='testing')
    ax.plot(x, F1_list, label='F1')
    plt.xlabel("max_features")
    plt.xticks(x)
    plt.ylabel("Accuracies")
    ax.legend()
    plt.savefig("RandomForestFeatures.png")
    plt.clf()
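Note: RandomForestClassifier, accuracy_score and f1 in the example above come from the assignment's own modules, not from scikit-learn. For comparison, a minimal self-contained sketch of the same max_features sweep using scikit-learn (an assumption about an equivalent setup: n_trees becomes n_estimators, and scikit-learn's metrics take y_true first):

# A minimal sketch with scikit-learn (an assumption, not the course
# implementation used above): n_trees maps to n_estimators, and the metric
# functions take y_true before y_pred.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=50, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

for i in [1, 2, 5, 8, 10, 20, 25, 35, 50]:
    rclf = RandomForestClassifier(max_depth=7, max_features=i, n_estimators=50)
    rclf.fit(x_train, y_train)
    preds_test = rclf.predict(x_test)
    print('max_features', i,
          'train', accuracy_score(y_train, rclf.predict(x_train)),
          'test', accuracy_score(y_test, preds_test),
          'F1', f1_score(y_test, preds_test))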
Example #2
 def test_adaboost(self):
     train_X,train_y,test_X,test_y = loadHorseColic()
     adaboost = AdaBoostClassifier()
     adaboost.fit(train_X,train_y)
     preds = adaboost.predict(test_X)
     print(accuracy_score(preds,test_y))
     assert accuracy_score(preds,test_y)>0.7
Example #3
def run_binary():
    print('Performing binary classification on synthetic data')
    X_train, X_test, y_train, y_test = toy_data_binary()
    logiRegr = LogisticRegressionWithL2(alpha=.1)

    logiRegr.binary_train(X_train, y_train)

    train_preds = logiRegr.binary_predict(X_train)
    preds = logiRegr.binary_predict(X_test)
    print(
        'train acc: %f, test acc: %f' %
        (accuracy_score(y_train, train_preds), accuracy_score(y_test, preds)))

    print('Performing binary classification on binarized MNIST')
    X_train, X_test, y_train, y_test = data_loader_mnist()
    logiRegr = LogisticRegressionWithL2(alpha=.1)

    binarized_y_train = [0 if yi < 5 else 1 for yi in y_train]
    binarized_y_test = [0 if yi < 5 else 1 for yi in y_test]

    logiRegr.binary_train(X_train, binarized_y_train)

    train_preds = logiRegr.binary_predict(X_train)
    preds = logiRegr.binary_predict(X_test)
    print('train acc: %f, test acc: %f' %
          (accuracy_score(binarized_y_train, train_preds),
           accuracy_score(binarized_y_test, preds)))
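If the labels returned by data_loader_mnist are NumPy arrays, the list-comprehension binarization above can also be written as a vectorized comparison; a small sketch, assuming NumPy arrays:

# Vectorized equivalent of the label binarization above (assumes the labels
# are NumPy arrays): digits 0-4 become class 0, digits 5-9 become class 1.
import numpy as np

binarized_y_train = (np.asarray(y_train) >= 5).astype(int)
binarized_y_test = (np.asarray(y_test) >= 5).astype(int)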
Example #4
def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    print('Decision Tree with depths 1-25 (inclusive)\n')

    # these will keep our points
    graphTrain = []
    graphTest = []
    graphF1 = []

    # perform decision tree testing for each depth
    # I'd like to reuse the decision_tree_testing function here, but we need to set the proper depth for each iteration
    for layer in range(1, 26):
        print('Current depth: ', layer)
        clf = DecisionTreeClassifier(max_depth=layer)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        graphTrain.append(accuracy_score(preds_train, y_train))
        graphTest.append(accuracy_score(preds_test, y_test))
        print('Train {}'.format(accuracy_score(preds_train, y_train)))
        print('Test {}'.format(accuracy_score(preds_test, y_test)))
        preds = clf.predict(x_test)
        print('F1 Test {}\n'.format(f1(y_test, preds)))
        graphF1.append(f1(y_test, preds))

    table = pd.DataFrame({
        "Max Depth": [item for item in range(1, 26)],
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')

    # get the best depth in terms of test-set F1 score
    topF1 = max(graphF1)
    bestDepth = graphF1.index(topF1) + 1  # depths start at 1
    print("The depth that gives the best test F1 score is: ", bestDepth,
          "which has an F1 score of ", topF1)

    # get the most important feature for making a prediction
    clfMVP = DecisionTreeClassifier(max_depth=bestDepth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ",
          clfMVP.root.feature)
    print("The threshold to split on for this feature is: ", clfMVP.root.split)

    # return the most important feature for use in main
    return clfMVP.root.feature
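clf.root.feature and clf.root.split are attributes of the assignment's own tree implementation. For reference, a self-contained sketch of reading the root split from scikit-learn's DecisionTreeClassifier (an assumption about an analogous setup, not the course code):

# Sketch with scikit-learn (an assumption about an analogous setup): the root
# node's split feature and threshold live in tree_.feature[0] / tree_.threshold[0].
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
print("root splits on feature", clf.tree_.feature[0],
      "at threshold", clf.tree_.threshold[0])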
Example #5
def random_forest_various_features(x_train, y_train, x_test, y_test):
    # keep our values to use for max_features
    useFeatures = [1, 2, 5, 8, 10, 20, 25, 35, 50]

    # keep these list names distinct from the similarly named ones in the other
    # plotting functions so their results don't get mixed up

    graphTrain2 = []
    graphTest2 = []
    graphF12 = []

    # let the user know which test this is
    print("== Beginning test for various max_features.\n")

    for features in useFeatures:
        print("max_features: ", features)
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        graphTrain2.append(accuracy_score(preds_train, y_train))
        graphTest2.append(accuracy_score(preds_test, y_test))
        print('Train {}'.format(accuracy_score(preds_train, y_train)))
        print('Test {}'.format(accuracy_score(preds_test, y_test)))
        preds = rclf.predict(x_test)
        graphF12.append(f1(y_test, preds))
        print('F1 Test {}\n'.format(f1(y_test, preds)))

    # print lengths for debugging
    print("== Length of Train", len(graphTrain2))
    print("== Length of Test", len(graphTest2))
    print("== Length of F1", len(graphF12))

    # table for easily reading data
    table2 = pd.DataFrame({
        "max_features": [i for i in useFeatures],
        "Train Accuracy": graphTrain2,
        "Test Accuracy": graphTest2,
        "F1 Accuracy": graphF12
    })
    print(table2)

    # plot our graph and output to a file
    plt.figure(3)
    plt.xlabel('Max Features')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Features')
    plt.plot('max_features', 'Train Accuracy', data=table2, color='blue')
    plt.plot('max_features', 'Test Accuracy', data=table2, color='green')
    plt.plot('max_features', 'F1 Accuracy', data=table2, color='red')
    plt.legend()
    plt.savefig('q2pd.png')

    # return best value for max_features to use in main
    return useFeatures[graphF12.index(max(graphF12))]
Example #6
 def test_knn(self):
     iris = load_iris()
     data = iris['data']
     target = iris['target']
     knn = KNN(k=3, tree='kdtree', distance='euclidean')
     knn.fit(data, array(mat(target).T))
     preds = knn.predict(data)
     print(accuracy_score(preds, target))
     assert accuracy_score(preds, target) > 0.9
Example #7
def random_forest_testing(x_train, y_train, x_test, y_test):
    print('Random Forest\n\n')
    rclf = RandomForestClassifier(max_depth=7, max_features=11, n_trees=50)
    rclf.fit(x_train, y_train)
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    preds = rclf.predict(x_test)
    print('F1 Test {}'.format(f1(y_test, preds)))
Example #8
 def test_linear_regression(self):
     lr = LinearRegression(learning_rate=1e-6,
                           max_iter=1000,
                           threshold=1e-4)
     train_X, train_y, test_X, test_y = split_train_test(data,
                                                         labels,
                                                         scale=0.7,
                                                         is_random=True)
     lr.fit(train_X, train_y)
     preds = lr.predict(test_X)
     print(accuracy_score(preds, test_y))
     assert accuracy_score(preds, test_y) > 0.8
Example #9
def decision_tree_testing(x_train, y_train, x_test, y_test):
    print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=20)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    preds = clf.predict(x_test)
    print('F1 Test {}'.format(f1(y_test, preds)))
Example #10
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    #print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=maxdepth)
    clf.fit(x_train, y_train)
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    #print('Train {}'.format(train_accuracy))
    #print('Test {}'.format(test_accuracy))
    preds = clf.predict(x_test)
    #print('F1 Test {}'.format(f1(y_test, preds)))
    return (f1(y_test, preds)), train_accuracy, test_accuracy
Example #11
def adaboost_testing(x_train, y_train, x_test, y_test, M):
	print("Adaboost Tree\n\n")
	aclf = AdaBoostClassifier(max_depth = 1)
	aclf.fit(x_train, y_train, M)
	preds_train = aclf.predict(x_train)
	preds_test = aclf.predict(x_test)
	train_accuracy = accuracy_score(preds_train, y_train)
	test_accuracy = accuracy_score(preds_test, y_test)
	print('Train {}'.format(train_accuracy))
	print('Test {}'.format(test_accuracy))
	preds = aclf.predict(x_test)
	print('F1 Test {}'.format(f1(y_test, preds)))
	preds_train = aclf.predict(x_train)
	return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds)
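The AdaBoostClassifier above is the assignment's own implementation, with the number of weak learners M passed to fit. A minimal self-contained sketch of an equivalent sweep with scikit-learn, where the number of weak learners is the n_estimators constructor argument (an assumption about an equivalent setup):

# Sketch of an equivalent sweep with scikit-learn (an assumption, not the
# course AdaBoost above): n_estimators plays the role of M, and the default
# base learner is a depth-1 decision tree (a stump).
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

for M in [10, 50, 100, 200]:
    aclf = AdaBoostClassifier(n_estimators=M).fit(x_train, y_train)
    preds = aclf.predict(x_test)
    print('M', M, 'test', accuracy_score(y_test, preds), 'F1', f1_score(y_test, preds))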
Example #12
def run_random_forest(data, target_column):
    st.sidebar.title('Choose parameters for Random Forest')
    ts = st.sidebar.slider('Training size', min_value=0.0, max_value=1.0, step=0.01, value=0.7)
    n_estimators = st.sidebar.number_input('n_estimators', min_value=1, max_value=1000, step=1)
    n_features = st.sidebar.number_input('n_features', min_value=1, max_value=len(data.columns)-1, step=1, value=len(data.columns)-1)
    bootstrap_size = st.sidebar.number_input('bootstrap_size', min_value=1, max_value=int(len(data)*ts), step=1, value=int(len(data)*ts))
    if st.sidebar.checkbox('Specify Depth'):
        max_depth = st.sidebar.number_input('max_depth', min_value=1, max_value=int(len(data)*ts), step=1)
    else:
        max_depth = None
    run_status = st.sidebar.button('Run Algorithm')
    if run_status:
        with st.spinner('Running...'):
            x_train, x_test, y_train, y_test = train_test_split(data.drop([target_column], axis=1),
                                                                data[target_column],
                                                                test_size=1 - ts)
            clf = RandomForest(n_estimators=n_estimators,
                               n_features=n_features,
                               max_depth=max_depth,
                               bootstrap_size=bootstrap_size)
            clf.fit(x_train, y_train)
            """
            ## :dart: Accuracy
            """
            st.subheader(accuracy_score(y_test, clf.predict(x_test)))
Example #13
    def test_randomforest_classifier(self):

        rf = RandomForestClassifier(n_estimators=30,sample_scale=0.67,feature_scale=0.6)
        rf.fit(mat(data),target)
        preds = rf.predict(mat(data))

        assert accuracy_score(preds,target)>0.95
Example #14
def forwardPropagation(input, weights, bias, originalOutput,
                       binarizedTruePrediction, prediction, numberOfSamples,
                       numberOfNeuronsInLayers, classes, optimizer):
    #computing output of first hidden layer
    h1In = numpy.dot(input[:, :3], weights[0]) + numpy.repeat(
        numpy.array([bias[0]]), repeats=[numberOfSamples], axis=0)
    h1Output = utils.relu(h1In)

    #computing output of second hidden layer
    h2In = numpy.dot(h1Output, weights[1]) + numpy.repeat(
        numpy.array([bias[1]]), repeats=[numberOfSamples], axis=0)
    h2Output = utils.relu(h2In)

    #computing output of the output layer
    OIn = numpy.dot(h2Output, weights[2]) + numpy.repeat(
        numpy.array([bias[2]]), repeats=[numberOfSamples], axis=0)
    OOutput = utils.softmax(OIn)

    myPredictedValueListAsIntegers = numpy.argmax(OOutput, axis=1)
    # Computing overall error only for plotting graph
    if prediction == False:
        errorForGraph = utils.log_loss(binarizedTruePrediction,
                                       OOutput[:] + 0.00001)
        errorForPlottingGraphList.append(errorForGraph)
        accuracyScoreForGraph.append(
            utils.accuracy_score(originalOutput,
                                 myPredictedValueListAsIntegers))
        backPropagation(input, OOutput, originalOutput,
                        binarizedTruePrediction, h1Output, h2Output,
                        numberOfNeuronsInLayers, classes, h1In, h2In, OIn,
                        optimizer)
    else:
        return myPredictedValueListAsIntegers
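The utils module referenced above is not shown. A sketch of plausible NumPy definitions for the helpers it provides (assumptions about what they likely compute, not the original code):

# Hypothetical NumPy versions of the helpers used above (utils.relu,
# utils.softmax, utils.accuracy_score); the original utils module is not shown.
import numpy

def relu(x):
    return numpy.maximum(0, x)

def softmax(x):
    # subtract the row-wise max for numerical stability
    e = numpy.exp(x - numpy.max(x, axis=1, keepdims=True))
    return e / numpy.sum(e, axis=1, keepdims=True)

def accuracy_score(y_true, y_pred):
    return numpy.mean(numpy.asarray(y_true) == numpy.asarray(y_pred))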
Example #15
def main():
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # One-hot encoding of nominal y-values
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        seed=8)

    # Perceptron
    clf = Perceptron(n_iterations=5000,
                     learning_rate=0.001,
                     loss=CrossEntropy,
                     activation_function=Sigmoid)
    clf.fit(X_train, y_train)

    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Perceptron",
                      accuracy=accuracy,
                      legend_labels=np.unique(y))
Example #16
def main():
    data = datasets.load_digits()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        seed=2)
    print("X_train.shape:", X_train.shape)
    print("Y_train.shape:", y_train.shape)

    clf = RandomForest(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Random Forest",
                      accuracy=accuracy,
                      legend_labels=data.target_names)
Example #17
def main():
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # convert the nominal y values to binary
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    #mlp
    clf = MultilayerPerceptron(n_hidden=16,
                               n_iterations=1000,
                               learning_rate=0.01)
    clf.fit(X_train, y_train)
    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Multilayer Perceptron",
                      accuracy=accuracy,
                      legend_labels=np.unique(y))
Example #18
def main():
    # Load dataset
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        seed=1)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Logistic Regression",
                      accuracy=accuracy,
                      legend_labels=data.target_names)
Example #19
def main():
    df = pd.read_csv('fishiris.csv')
    df['target'] = df.apply(create_target, axis=1)
    y = df['target'].to_numpy()
    df = df.drop(['Name', 'target'], axis=1)
    feature_names = df.columns.tolist()
    X = df.to_numpy()
    target_names = ['setosa', 'versicolor', 'virginica']

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        shuffle=True)
    print('X_train\n', X_train)
    print('y_train\n', y_train)
    print('X_test\n', X_test)
    print('y_test\n', y_test)
    clf = ClassificationTree()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print('-' * 40, 'print_tree', '-' * 40)
    clf.print_tree(feature_names=feature_names)
    print('-' * 40, 'print_tree', '-' * 40)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Decision Tree",
                      accuracy=accuracy,
                      legend_labels=target_names)
    Plot().plot_in_3d(X_test, y_pred)
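create_target is not shown in this example. Assuming the CSV's Name column holds the species strings, a plausible (hypothetical) implementation mapping each name to an integer class could be:

# Hypothetical create_target for the example above (the original helper is not
# shown): maps the species string in the Name column to an integer label.
def create_target(row):
    return {'setosa': 0, 'versicolor': 1, 'virginica': 2}[row['Name']]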
Example #20
def ada_boost_testing(x_train, y_train, x_test, y_test, num_learner=50):
    print('Ada Boost')
    print(x_train, y_train)
    aba = AdaBoostClassifier(num_learner)
    aba.fit(x_train, y_train)
    preds_train = aba.predict(x_train)
    preds_test = aba.predict(x_test)

    print(preds_train, preds_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    preds = aba.predict(x_test)
    print('F1 Test {}'.format(f1(y_test, preds)))
Example #21
def random_forest_various_seeds(x_train, y_train, x_test, y_test,
                                best_max_features, best_n_trees):
    # let the user know which test this is
    print("== Beginning test for best result with random seeds.\n")

    # to hold data points
    randseedTrain = []
    randseedTest = []
    randseedF1 = []
    averageSeeds = []
    averageTrain = []
    averageTest = []
    averageF1 = []
    usedSeeds = []

    rclf = RandomForestClassifier(max_depth=7,
                                  max_features=best_max_features,
                                  n_trees=best_n_trees)

    for _ in range(10):
        rclf.seed = np.random.randint(1, 1000)
        usedSeeds.append(rclf.seed)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        randseedTrain.append(accuracy_score(preds_train, y_train))
        randseedTest.append(accuracy_score(preds_test, y_test))
        print('Train {}'.format(accuracy_score(preds_train, y_train)))
        print('Test {}'.format(accuracy_score(preds_test, y_test)))
        preds = rclf.predict(x_test)
        randseedF1.append(f1(y_test, preds))
        print('F1 Test {}\n'.format(f1(y_test, preds)))

    # get averages
    averageSeeds.append("Average")
    averageTrain.append(sum(randseedTrain) / len(randseedTrain))
    averageTest.append(sum(randseedTest) / len(randseedTest))
    averageF1.append(sum(randseedF1) / len(randseedF1))

    # get table for data + add averages at the end
    table3 = pd.DataFrame({
        "Seed": [i for i in usedSeeds] + averageSeeds,
        "Train Accuracy": randseedTrain + averageTrain,
        "Test Accuracy": randseedTest + averageTest,
        "F1 Score": randseedF1 + averageF1
    })
    print(table3)
Example #22
def random_forest_tune_MaxFeatures(x_train, y_train, x_test, y_test):
    print('Random Forest tune\n\n')
    plotX = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    plotTrain = []
    plotTest = []
    plotF1 = []

    for max_features in plotX:
        print("MAX_Features: ", max_features)
        rclf = RandomForestClassifier(max_depth=7,
                                      max_features=max_features,
                                      n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        preds = rclf.predict(x_test)
        F1 = round(f1(y_test, preds), 3)
        print('F1 Test {}'.format(F1))
        print('\n')
        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "MAX_Features": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Accuracy": plotF1
    })
    print(df)
    maxAccuracy = max(plotF1)
    best_MAX_Features = plotX[plotF1.index(maxAccuracy)]
    print("The best MAX_Features is ", best_MAX_Features, "with F1 accuracy ",
          maxAccuracy)

    print("Drawing plot")
    plt.plot('MAX_Features', 'Train_Accuracy', data=df, color='red')
    plt.plot('MAX_Features', 'Test_Accuracy', data=df, color='blue')
    plt.plot('MAX_Features', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('random_forest_output_max_features.png')
    plt.close()
    return best_MAX_Features
Example #23
def ada_boost_testing(x_train, y_train, x_test, y_test, num_learner):
	print('Ada Boost and L(', num_learner, ')')
	aba = AdaBoostClassifier(num_learner)
	aba.fit(x_train, y_train)
	preds_train = aba.predict(x_train)
	preds_test = aba.predict(x_test)
	train_accuracy = accuracy_score(preds_train, y_train)
	test_accuracy = accuracy_score(preds_test, y_test)
	print('Train {}'.format(train_accuracy))
	print('Test {}'.format(test_accuracy))
	preds = aba.predict(x_test)
	preds_train = aba.predict(x_train)

	print('F1 Train {}'.format(f1(y_train, preds_train)))
	print('F1 Test {}\n'.format(f1(y_test, preds)))

	return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds)
Example #24
def adaBoost_testing(x_train, y_train, x_test, y_test, l):

    clf = AdaBoostClassifier(l)
    clf.fit(x_train, y_train)

    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)
    train_accuracy = round(accuracy_score(preds_train, y_train), 3)
    test_accuracy = round(accuracy_score(preds_test, y_test), 3)
    print("L: ", l)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    preds = clf.predict(x_test)
    F1 = round(f1(y_test, preds), 3)
    print('F1 Test {}'.format(F1))
    print()
    return train_accuracy, test_accuracy, F1
Example #25
 def test_nb_using_iris(self):
     iris = load_iris()
     data = iris['data']
     target = iris['target']
     nb = NaiveBayes()
     nb.fit(data, target)
     preds = nb.predict(data)
     assert accuracy_score(preds, target) > 0.9
Example #26
 def test_id3(self):
     id3 = ID3(max_depth=100, max_leafs=100, epsilon=0.00001)
     dataset, labels, feat_labels = load_loans()
     id3.set_feature_labels(feat_labels)
     id3.fit(dataset, labels)
     preds = id3.predict(dataset)
     score = accuracy_score(preds, labels)
     assert score > 0.99
Example #27
def random_forest_various_trees(x_train, y_train, x_test, y_test):
    graphTrain = []
    graphTest = []
    graphF1 = []

    # let the user know which test this is
    print("== Beginning test for various n_trees.\n")

    # plot accuracies for the number of trees specified in part b
    for i in range(10, 210, 10):
        print("n_trees: ", i)
        rclf = RandomForestClassifier(max_depth=7, max_features=11, n_trees=i)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)
        graphTrain.append(accuracy_score(preds_train, y_train))
        graphTest.append(accuracy_score(preds_test, y_test))
        print('Train {}'.format(accuracy_score(preds_train, y_train)))
        print('Test {}'.format(accuracy_score(preds_test, y_test)))
        preds = rclf.predict(x_test)
        print('F1 Test {}\n'.format(f1(y_test, preds)))
        graphF1.append(f1(y_test, preds))

    # table for easily reading data
    table = pd.DataFrame({
        "n_trees": [i for i in range(10, 210, 10)],
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.figure(2)
    plt.xlabel('Number of trees')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Number of Trees in the Forest')
    plt.plot('n_trees', 'Train Accuracy', data=table, color='blue')
    plt.plot('n_trees', 'Test Accuracy', data=table, color='green')
    plt.plot('n_trees', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q2pb.png')

    # return our best n_trees value for use in main
    return list(range(10, 210, 10))[graphF1.index(max(graphF1))]
Example #28
def random_forest_testing(x_train, y_train, x_test, y_test, n_trees, max_features):
	print('Random Forest')
	print("max_depth: %d, max_features: %d, n_trees: %d" % (7,max_features, n_trees))
	rclf = RandomForestClassifier(n_trees, max_features, max_depth=7)
	rclf.fit(x_train, y_train)
	preds_train = rclf.predict(x_train)
	preds_test = rclf.predict(x_test)
	train_accuracy = accuracy_score(preds_train, y_train)
	test_accuracy = accuracy_score(preds_test, y_test)
	print('Train {}'.format(train_accuracy))
	print('Test {}'.format(test_accuracy))
	preds = rclf.predict(x_test)
	preds_train = rclf.predict(x_train)

	print('F1 Train {}'.format(f1(y_train, preds_train)))
	print('F1 Test {}\n'.format(f1(y_test, preds)))

	return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds)
Example #29
def decision_tree_tune(x_train, y_train, x_test, y_test):
    print('Decision Tree tune\n\n')
    plotX = [i for i in range(1, 26)]
    plotTrain = []
    plotTest = []
    plotF1 = []

    for depth in range(1, 26):
        print('Max Depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)
        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        preds = clf.predict(x_test)
        F1 = round(f1(y_test, preds), 3)
        print('F1 Test {}'.format(F1))
        print('\n')
        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "Max_Depth": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Accuracy": plotF1
    })
    print(df)
    maxAccuracy = max(plotF1)
    bestDepth = plotX[plotF1.index(maxAccuracy)]
    print("The best Depth is ", bestDepth, "with F1 accuracy ", maxAccuracy)

    print("Drawing plot")
    plt.plot('Max_Depth', 'Train_Accuracy', data=df, color='red')
    plt.plot('Max_Depth', 'Test_Accuracy', data=df, color='blue')
    plt.plot('Max_Depth', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('decision_tree_output.png')
    plt.close()
    return bestDepth
Example #30
def decision_tree_testing(x_train, y_train, x_test, y_test, max_depth):
	print('Decision Tree')
	print("depth : %d" % max_depth)
	
	clf = DecisionTreeClassifier(max_depth)
	clf.fit(x_train, y_train)
	preds_train = clf.predict(x_train)
	preds_test = clf.predict(x_test)
	train_accuracy = accuracy_score(preds_train, y_train)
	test_accuracy = accuracy_score(preds_test, y_test)
	print('Train {}'.format(train_accuracy))
	print('Test {}'.format(test_accuracy))
	preds = clf.predict(x_test)
	preds_train =clf.predict(x_train)

	print('F1 Train {}'.format(f1(y_train, preds_train)))
	print('F1 Test {}\n'.format(f1(y_test, preds)))
	
	return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds) 
Example #31
def main():
    data = datasets.load_digits()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=2)
    print("X_train.shape:", X_train.shape)
    print("Y_train.shape:", y_train.shape)

    clf = RandomForest(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test, y_pred, title="Random Forest", accuracy=accuracy, legend_labels=data.target_names)
Example #32
def main():
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    print("X_train",X_train.shape)
    clf = NaiveBayes()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print ("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, y_pred, title="Naive Bayes", accuracy=accuracy, legend_labels=data.target_names)
Example #33
def main():
    # Load dataset
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test, y_pred, title="Logistic Regression", accuracy=accuracy)
Example #34
def main():

    print ("-- Classification Tree --")

    data = datasets.load_iris()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    clf = ClassificationTree()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print ("Accuracy:", accuracy)

    Plot().plot_in_2d(X_test, y_pred,
        title="Decision Tree",
        accuracy=accuracy,
        legend_labels=data.target_names)
Example #35
def main():

    print ("-- Gradient Boosting Classification --")

    data = datasets.load_iris()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    print(y_train)

    clf = GBDTClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print ("Accuracy:", accuracy)


    Plot().plot_in_2d(X_test, y_pred,
        title="Gradient Boosting",
        accuracy=accuracy,
        legend_labels=data.target_names)
Example #36
def run(args):
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    num_steps = args.num_time_steps
    num_classes = 10
    num_lstm_units = args.num_lstm_units
    num_lstm_layer = 1
    alpha = args.alpha
    location_sigma = args.location_sigma
    glimpse_size = (12, 12)

    image_rows, image_cols = [int(v) for v in args.image_size.split("x")]

    mnist = input_data.read_data_sets("data", one_hot=True)

    sess = tf.Session()
    K.set_session(sess)

    image = tf.placeholder(tf.float32, (None, image_rows, image_cols, 1))
    label = tf.placeholder(tf.int32, (None, num_classes))

    tf.image_summary("translated mnist", image, max_images=3)

    cell = tf.nn.rnn_cell.LSTMCell(num_lstm_units, forget_bias=1., use_peepholes=True, state_is_tuple=True)
    # cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_lstm_layer, state_is_tuple=True)
    state = initial_state = cell.zero_state(tf.shape(image)[0], dtype=tf.float32)

    location_net = Dense(2, activation="linear", name="location_net")
    h_g = Dense(128, activation="relu", name="h_g")
    h_l = Dense(128, activation="relu", name="h_l")
    linear_h_g = Dense(256, activation="linear", name="linear_h_g")
    linear_h_l = Dense(256, activation="linear", name="linear_h_l")

    locations = []
    loc_means = []
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()

            h_tm1 = state.h

            loc_mean = location_net(h_tm1)
            tf.histogram_summary("loc_mean(t=%d) without tanh" % time_step, loc_mean)
            # loc_mean = 1.7159 * tf.nn.tanh(2/3 * loc_mean)
            # tf.histogram_summary("loc_mean(t=%d)" % time_step, loc_mean)
            locations += [tf.stop_gradient(tf.random_normal((batch_size, 2), loc_mean, location_sigma))]
            loc_means += [loc_mean]

            sizes = [(glimpse_size[0] * (i + 1), glimpse_size[1] * (i + 1))
                     for i in range(3)]
            glimpses = take_glimpses(image, locations[-1], sizes)
            glimpse = tf.concat(3, glimpses)
            glimpse = tf.reshape(glimpse, (-1, np.prod(glimpse_size) * len(sizes)))

            _h_g = h_g(glimpse)
            _h_l = h_l(locations[-1])
            inputs = tf.nn.relu(linear_h_g(_h_g) + linear_h_l(_h_l))
            (cell_output, state) = cell(inputs, state)
            tf.image_summary("12x12 glimpse t=%d" % time_step, glimpses[-1], max_images=5)

    logits = Dense(num_classes, name="logits")(state.h)
    inference = tf.nn.softmax(logits)
    prediction = tf.arg_max(inference, 1)
    R = tf.cast(tf.equal(prediction, tf.arg_max(label, 1)), tf.float32)
    R = tf.stop_gradient(tf.expand_dims(R, 1))

    accuracy = tf.reduce_mean(R)
    tf.scalar_summary("accuracy", accuracy)

    loss = tf.nn.softmax_cross_entropy_with_logits(logits, tf.cast(label, tf.float32))
    loss = tf.reduce_mean(loss)
    tf.scalar_summary("xentropy", loss)

    b = K.variable(0., name="baseline")
    tf.scalar_summary("baseline", b)

    reinforce_loss = 0.
    for time_step, (l, l_mean) in enumerate(zip(locations, loc_means)):
        b_val = 0.
        if args.baseline:
            b_val = tf.stop_gradient(b)

        p = 1. / tf.sqrt(2 * np.pi * tf.square(location_sigma))
        p *= tf.exp(-tf.square(l - l_mean) / (2 * tf.square(location_sigma)))
        reinforce_loss -= alpha * (R - b_val) * tf.log(p + K.epsilon())

    baseline_loss = tf.squared_difference(tf.reduce_mean(R), b)
    tf.scalar_summary("loss:baseline", baseline_loss)

    reinforce_loss = tf.reduce_sum(tf.reduce_mean(reinforce_loss, reduction_indices=0))
    tf.scalar_summary("loss:reinforce", reinforce_loss)

    total_loss = loss + reinforce_loss + baseline_loss
    tf.scalar_summary("loss:total", total_loss)

    if str.lower(args.optimizer) == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    elif str.lower(args.optimizer) == "momentum":
        optimizer = tf.train.MomentumOptimizer(learning_rate=args.learning_rate, momentum=args.momentum)

    tvars = tf.trainable_variables()
    grads = tf.gradients(total_loss, tvars)
    for tvar, grad in zip(tvars, grads):
        tf.histogram_summary(tvar.name, grad)
    train_step = optimizer.apply_gradients(zip(grads, tvars))

    merged = tf.merge_all_summaries()
    summary_writer = tf.train.SummaryWriter(args.logdir, sess.graph)

    # Training
    sess.run(tf.initialize_all_variables())
    initial_c, initial_h = sess.run([initial_state.c, initial_state.h],
                                    feed_dict={image: np.zeros((batch_size, image_rows, image_cols, 1))})

    saver = tf.train.Saver()
    if args.train == 1:
        epoch_loss = []
        epoch_reinforce_loss = []
        epoch_acc = []

        global_step = 0
        while mnist.train.epochs_completed < num_epochs:
            current_epoch = mnist.train.epochs_completed
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            batch_x = translate(batch_x.reshape((-1, 28, 28, 1)), size=(image_rows, image_cols))

            preds, loss, r_loss, summary, _ = sess.run([prediction, total_loss, reinforce_loss, merged, train_step],
                                                       feed_dict={image: batch_x, label: batch_y,
                                                                  initial_state.c: initial_c, initial_state.h: initial_h,
                                                                  K.learning_phase(): 1})
            epoch_loss += [loss]
            epoch_reinforce_loss += [r_loss]
            epoch_acc += [accuracy_score(preds, np.argmax(batch_y, axis=1))]

            summary_writer.add_summary(summary, global_step)
            global_step += 1

            if mnist.train.epochs_completed != current_epoch:
                print("[Epoch %d/%d]" % (current_epoch + 1, num_epochs))
                print("loss:", np.asarray(epoch_loss).mean())
                print("reinforce_loss: %.5f+/-%.5f" % (
                      np.asarray(epoch_reinforce_loss).mean(),
                      np.asarray(epoch_reinforce_loss).std()))
                print("acc: ", np.asarray(epoch_acc).mean())

                epoch_acc = []
                epoch_loss = []
                epoch_reinforce_loss = []

                val_loss = []
                val_reinforce_loss = []
                val_acc = []
                while mnist.validation.epochs_completed != 1:
                    batch_x, batch_y = mnist.validation.next_batch(batch_size)
                    batch_x = translate(batch_x.reshape((-1, 28, 28, 1)), size=(image_rows, image_cols))
                    res = sess.run([prediction, total_loss, reinforce_loss] + locations,
                                   feed_dict={image: batch_x.reshape((-1, image_rows, image_cols, 1)),
                                              label: batch_y,
                                              initial_state.c: initial_c, initial_state.h: initial_h,
                                              K.learning_phase(): 0})
                    preds, loss, r_loss = res[:3]
                    locs = res[3:]
                    val_loss += [loss]
                    val_reinforce_loss += [r_loss]
                    val_acc += [accuracy_score(preds, np.argmax(batch_y, axis=1))]

                    images = batch_x.reshape((-1, image_rows, image_cols))
                    locs = np.asarray(locs, dtype=np.float32)
                    locs = (locs + 1) * (image_rows / 2)
                    plot_glimpse(images, locs, name=args.logdir + "/glimpse.png")
                mnist.validation._epochs_completed = 0
                mnist.validation._index_in_epoch = 0

                print("Val loss:", np.asarray(val_loss).mean())
                print("Val reinforce_loss: %.5f+/-%.5f" % (
                      np.asarray(val_reinforce_loss).mean(),
                      np.asarray(val_reinforce_loss).std()))
                print("Val acc: ", np.asarray(val_acc).mean())
        saver.save(sess, args.checkpoint)

    if len(args.checkpoint) > 0:
        saver.restore(sess, args.checkpoint)

    # plot results
    batch_x, _ = mnist.train.next_batch(batch_size)
    batch_x = translate(batch_x.reshape((-1, 28, 28, 1)), size=(image_rows, image_cols))

    locs = sess.run(locations, feed_dict={image: batch_x.reshape((-1, image_rows, image_cols, 1)),
                                          initial_state.c: initial_c, initial_state.h: initial_h,
                                          K.learning_phase(): 0})

    images = batch_x.reshape((-1, image_rows, image_cols))
    locs = np.asarray(locs, dtype=np.float32)
    locs = (locs + 1) * (image_rows / 2)
    plot_glimpse(images, locs)
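The REINFORCE term above multiplies the reward by the log-density of each sampled location under a Gaussian centred at loc_mean with standard deviation location_sigma. A small NumPy sketch of that log-density, for illustration only (not part of the TensorFlow graph above):

# Illustration only (not part of the TensorFlow graph above): the per-dimension
# Gaussian log-density that the REINFORCE term weights by the reward.
import numpy as np

def gaussian_log_prob(l, l_mean, sigma):
    return -0.5 * np.log(2 * np.pi * sigma ** 2) - (l - l_mean) ** 2 / (2 * sigma ** 2)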
Example #37
 def acc(self, y, p):
     return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1))