def ft_random_forest_testing(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for a random forest and plot the resulting scores.

    For every candidate value a forest with ``max_depth=7`` and
    ``n_trees=50`` is fitted on the training split. Train accuracy, test
    accuracy and test F1 are printed per candidate and finally plotted
    against ``max_features`` into ``RandomForestFeatures.png``.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    print('Random Forest Feature Loop\n\n')

    # Single source of truth for the sweep: it drives the loop and is the
    # x-axis of the plot, so the two can never drift apart.
    feature_counts = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    train_list = []
    test_list = []
    F1_list = []

    for i in feature_counts:
        rclf = RandomForestClassifier(max_depth=7, max_features=i, n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        # Score F1 on the test predictions we already have instead of
        # running the whole forest over x_test a second time.
        f1_test = f1(y_test, preds_test)

        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(f1_test))

        # Grab the useful number per cycle
        train_list.append(train_accuracy)
        test_list.append(test_accuracy)
        F1_list.append(f1_test)

    plt.rcParams['font.family'] = ['serif']
    ax = plt.subplot(111)
    ax.plot(feature_counts, train_list, label='training')
    ax.plot(feature_counts, test_list, label='testing')
    ax.plot(feature_counts, F1_list, label='F1')
    plt.xlabel("max_features")
    plt.xticks(feature_counts)
    plt.ylabel("Accuracies")
    ax.legend()
    plt.savefig("RandomForestFeatures.png")
    # Clear the current figure so later plots do not inherit these lines.
    plt.clf()
def test_adaboost(self):
    """Fit AdaBoost on the ``loadHorseColic()`` split and require test accuracy > 0.7."""
    train_X, train_y, test_X, test_y = loadHorseColic()
    adaboost = AdaBoostClassifier()
    adaboost.fit(train_X, train_y)
    preds = adaboost.predict(test_X)

    # Score once so the printed value is exactly the value asserted on.
    accuracy = accuracy_score(preds, test_y)
    print(accuracy)
    assert accuracy > 0.7, 'AdaBoost test accuracy {} is not above 0.7'.format(accuracy)
def run_binary():
    """Train and score the L2-regularised logistic regression on two binary tasks.

    The first task is the synthetic data from ``toy_data_binary()``. The
    second is the data from ``data_loader_mnist()`` with labels collapsed
    to two classes (label < 5 becomes 0, everything else 1). Train and test
    accuracy are printed for each task.
    """
    report = 'train acc: %f, test acc: %f'

    def binarize(labels):
        # Labels below 5 map to class 0, all others to class 1.
        return [0 if label < 5 else 1 for label in labels]

    print('Performing binary classification on synthetic data')
    X_train, X_test, y_train, y_test = toy_data_binary()
    model = LogisticRegressionWithL2(alpha=.1)
    model.binary_train(X_train, y_train)
    fitted_train = model.binary_predict(X_train)
    fitted_test = model.binary_predict(X_test)
    train_acc = accuracy_score(y_train, fitted_train)
    test_acc = accuracy_score(y_test, fitted_test)
    print(report % (train_acc, test_acc))

    print('Performing binary classification on binarized MNIST')
    X_train, X_test, y_train, y_test = data_loader_mnist()
    model = LogisticRegressionWithL2(alpha=.1)
    two_class_train = binarize(y_train)
    two_class_test = binarize(y_test)
    model.binary_train(X_train, two_class_train)
    fitted_train = model.binary_predict(X_train)
    fitted_test = model.binary_predict(X_test)
    train_acc = accuracy_score(two_class_train, fitted_train)
    test_acc = accuracy_score(two_class_test, fitted_test)
    print(report % (train_acc, test_acc))
def decision_tree_various_depth(x_train, y_train, x_test, y_test):
    """Sweep decision-tree depth 1..25, plot the scores and report the best depth.

    For each depth a tree is fitted on the training split; train accuracy,
    test accuracy and test F1 are printed, tabulated and plotted to
    ``q1.png``. A tree is then refitted at the depth with the highest test
    F1 (the first one on ties) and its root split is reported.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        ``root.feature`` of the tree refitted at the best depth.
    """
    print('Decision Tree with depths 1-25 (inclusive)\n')

    # One depth list shared by the loop, the table and the best-depth lookup.
    depths = list(range(1, 26))

    # these will keep our points
    graphTrain = []
    graphTest = []
    graphF1 = []

    # perform decision tree testing for each depth
    for layer in depths:
        print('Current depth: ', layer)
        clf = DecisionTreeClassifier(max_depth=layer)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)

        # Compute every metric once and reuse the existing test predictions
        # for F1 rather than predicting x_test a second time.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        f1_test = f1(y_test, preds_test)

        graphTrain.append(train_accuracy)
        graphTest.append(test_accuracy)
        graphF1.append(f1_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(f1_test))

    table = pd.DataFrame({
        "Max Depth": depths,
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.xlabel('Depth')
    plt.ylabel('Performance')
    # The x-axis is tree depth; the previous title wrongly said "Number of Trees".
    plt.title('Accuracy & F1 Score vs Max Depth')
    plt.plot('Max Depth', 'Train Accuracy', data=table, color='blue')
    plt.plot('Max Depth', 'Test Accuracy', data=table, color='green')
    plt.plot('Max Depth', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q1.png')

    # get best depth in terms of test F1 (list.index returns the first maximum)
    topAccuracy = max(graphF1)
    bestDepth = depths[graphF1.index(topAccuracy)]
    print("The depth that gives the best validation accuracy is: ", bestDepth,
          "which has an F1 accuracy of ", topAccuracy)

    # get the most important feature for making a prediction
    clfMVP = DecisionTreeClassifier(max_depth=bestDepth)
    clfMVP.fit(x_train, y_train)
    print("The most important feature for making a prediction is: ", clfMVP.root.feature)
    print("The threshold to split on for this feature is: ", clfMVP.root.split)

    # return the most important feature for use in main
    return clfMVP.root.feature
def random_forest_various_features(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for a random forest and return the best value.

    Each candidate is evaluated with ``max_depth=7`` and ``n_trees=50``.
    Train accuracy, test accuracy and test F1 are printed, tabulated and
    plotted on figure 3 into ``q2pd.png``.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        The ``max_features`` candidate with the highest test F1 (the first
        one on ties).
    """
    # keep our values to use for max_features
    useFeatures = [1, 2, 5, 8, 10, 20, 25, 35, 50]

    # NOTE(review): the original author suffixed these names with "2" to dodge
    # an unexplained clash with another function. Locals cannot clash across
    # functions, so the cause was presumably shared matplotlib figure state
    # (handled by plt.figure(3) below) -- confirm before renaming.
    graphTrain2 = []
    graphTest2 = []
    graphF12 = []

    # let the user know which test this is
    print("== Beginning test for various max_features.\n")

    for features in useFeatures:
        print("max_features: ", features)
        rclf = RandomForestClassifier(max_depth=7, max_features=features, n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Each metric is computed once; F1 reuses the test predictions
        # instead of running the forest over x_test again.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        f1_test = f1(y_test, preds_test)

        graphTrain2.append(train_accuracy)
        graphTest2.append(test_accuracy)
        graphF12.append(f1_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(f1_test))

    # print lengths for debugging
    print("== Length of Train", len(graphTrain2))
    print("== Length of Test", len(graphTest2))
    print("== Length of F1", len(graphF12))

    # table for easily reading data
    table2 = pd.DataFrame({
        "max_features": useFeatures,
        "Train Accuracy": graphTrain2,
        "Test Accuracy": graphTest2,
        "F1 Accuracy": graphF12
    })
    print(table2)

    # plot our graph and output to a file
    plt.figure(3)
    plt.xlabel('Max Features')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Max Features')
    plt.plot('max_features', 'Train Accuracy', data=table2, color='blue')
    plt.plot('max_features', 'Test Accuracy', data=table2, color='green')
    plt.plot('max_features', 'F1 Accuracy', data=table2, color='red')
    plt.legend()
    plt.savefig('q2pd.png')

    # return best value for max_features to use in main
    return useFeatures[graphF12.index(max(graphF12))]
def test_knn(self):
    """Fit a 3-NN kd-tree classifier on iris and require training accuracy > 0.9."""
    iris = load_iris()
    data = iris['data']
    target = iris['target']

    knn = KNN(k=3, tree='kdtree', distance='euclidean')
    # fit() receives the labels as array(mat(target).T), exactly as before.
    knn.fit(data, array(mat(target).T))
    preds = knn.predict(data)

    # Score once so the printed value is exactly the value asserted on.
    accuracy = accuracy_score(preds, target)
    print(accuracy)
    assert accuracy > 0.9, 'KNN accuracy {} is not above 0.9'.format(accuracy)
def random_forest_testing(x_train, y_train, x_test, y_test):
    """Fit one random forest (depth 7, 11 features, 50 trees) and print its scores.

    Prints train accuracy, test accuracy and test F1.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    print('Random Forest\n\n')
    rclf = RandomForestClassifier(max_depth=7, max_features=11, n_trees=50)
    rclf.fit(x_train, y_train)

    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # F1 is scored on the same test predictions as the accuracy above; the
    # forest is no longer run over x_test a second time.
    print('F1 Test {}'.format(f1(y_test, preds_test)))
def test_linear_regression(self):
    """Fit the linear model on a random 70/30 split and require a score > 0.8.

    The names ``data`` and ``labels`` are not defined here, so they are read
    from the enclosing (module) scope.
    """
    lr = LinearRegression(learning_rate=1e-6, max_iter=1000, threshold=1e-4)
    train_X, train_y, test_X, test_y = split_train_test(data, labels, scale=0.7, is_random=True)
    lr.fit(train_X, train_y)
    preds = lr.predict(test_X)

    # Score once so the printed value is exactly the value asserted on.
    accuracy = accuracy_score(preds, test_y)
    print(accuracy)
    assert accuracy > 0.8, 'LinearRegression score {} is not above 0.8'.format(accuracy)
def decision_tree_testing(x_train, y_train, x_test, y_test):
    """Fit one decision tree with ``max_depth=20`` and print its scores.

    Prints train accuracy, test accuracy and test F1.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
    """
    print('Decision Tree\n\n')
    clf = DecisionTreeClassifier(max_depth=20)
    clf.fit(x_train, y_train)

    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse the existing test predictions for F1 instead of predicting again.
    print('F1 Test {}'.format(f1(y_test, preds_test)))
def create_trees(x_train, y_train, x_test, y_test, maxdepth):
    """Fit a decision tree of the given depth and return its scores silently.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        maxdepth: Value passed as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        A tuple ``(test_f1, train_accuracy, test_accuracy)``.
    """
    clf = DecisionTreeClassifier(max_depth=maxdepth)
    clf.fit(x_train, y_train)

    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)

    # F1 is scored on the test predictions already computed above; the tree
    # is not asked to predict x_test a second time.
    return f1(y_test, preds_test), train_accuracy, test_accuracy
def adaboost_testing(x_train, y_train, x_test, y_test, M):
    """Fit AdaBoost over depth-1 trees and report accuracy and F1 on both splits.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        M: Forwarded unchanged as the third positional argument of
            ``AdaBoostClassifier.fit`` (presumably the number of boosting
            rounds -- confirm against the classifier).

    Returns:
        A tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print("Adaboost Tree\n\n")
    aclf = AdaBoostClassifier(max_depth=1)
    aclf.fit(x_train, y_train, M)

    # Predict each split exactly once; every metric below reuses these.
    preds_train = aclf.predict(x_train)
    preds_test = aclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Test {}'.format(f1_test))

    return train_accuracy, test_accuracy, f1_train, f1_test
def run_random_forest(data, target_column):
    """Render the random-forest sidebar controls and, on request, train and score.

    The sidebar collects the training fraction, ``n_estimators``,
    ``n_features``, ``bootstrap_size`` and an optional ``max_depth``. When
    the "Run Algorithm" button is pressed the data is split, a
    ``RandomForest`` is fitted, and the test accuracy is shown as a
    subheader.

    Args:
        data: Table exposing ``columns``, ``len()``, ``drop`` and column
            lookup by name (presumably a pandas DataFrame -- confirm).
        target_column: Name of the label column in ``data``.
    """
    sidebar = st.sidebar
    sidebar.title('Choose parameters for Random Forest')

    ts = sidebar.slider('Training size', min_value=0.0, max_value=1.0, step=0.01, value=0.7)
    n_estimators = sidebar.number_input('n_estimators', min_value=1, max_value=1000, step=1)
    n_features = sidebar.number_input(
        'n_features',
        min_value=1,
        max_value=len(data.columns) - 1,
        step=1,
        value=len(data.columns) - 1,
    )
    bootstrap_size = sidebar.number_input(
        'bootstrap_size',
        min_value=1,
        max_value=int(len(data) * ts),
        step=1,
        value=int(len(data) * ts),
    )

    # Depth stays unlimited (None) unless the user ticks the checkbox.
    max_depth = None
    if sidebar.checkbox('Specify Depth'):
        max_depth = sidebar.number_input('max_depth', min_value=1, max_value=int(len(data) * ts), step=1)

    run_status = sidebar.button('Run Algorithm')
    if not run_status:
        return

    with st.spinner('Running...'):
        features = data.drop([target_column], axis=1)
        labels = data[target_column]
        x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=1 - ts)

        clf = RandomForest(
            n_estimators=n_estimators,
            n_features=n_features,
            max_depth=max_depth,
            bootstrap_size=bootstrap_size,
        )
        clf.fit(x_train, y_train)

        # Bare string expression kept deliberately (not a docstring): left as
        # a stand-alone statement exactly as in the original.
        """ ## :dart: Accuracy """
        st.subheader(accuracy_score(y_test, clf.predict(x_test)))
def test_randomforest_classifier(self):
    """Fit a 30-estimator forest and require training accuracy above 0.95.

    ``data`` and ``target`` are not defined here, so they are read from the
    enclosing (module) scope.
    """
    forest = RandomForestClassifier(n_estimators=30, sample_scale=0.67, feature_scale=0.6)
    forest.fit(mat(data), target)

    predicted = forest.predict(mat(data))
    score = accuracy_score(predicted, target)
    assert score > 0.95
def forwardPropagation(input, weights, bias, originalOutput,
                       binarizedTruePrediction, prediction, numberOfSamples,
                       numberOfNeuronsInLayers, classes, optimizer):
    """Run the forward pass of the two-hidden-layer network.

    Both hidden layers use ``utils.relu`` and the output layer uses
    ``utils.softmax``. Only the first three columns of ``input`` are fed to
    the network.

    When ``prediction == False`` (training mode) the log loss and accuracy
    are appended to the enclosing-scope lists ``errorForPlottingGraphList``
    and ``accuracyScoreForGraph``, ``backPropagation`` is invoked, and the
    function returns ``None``. Otherwise the arg-max class index per sample
    is returned.

    Args:
        input: 2-D sample array; columns 0..2 are used as features.
        weights: Indexable holding the three layer weight matrices.
        bias: Indexable holding the three layer bias vectors.
        originalOutput: True labels, used for the accuracy score.
        binarizedTruePrediction: True labels in the form ``utils.log_loss`` expects.
        prediction: ``False`` for a training step, anything else for inference.
        numberOfSamples: Row count used to tile each bias vector.
        numberOfNeuronsInLayers: Forwarded to ``backPropagation``.
        classes: Forwarded to ``backPropagation``.
        optimizer: Forwarded to ``backPropagation``.

    Returns:
        The predicted class indices in inference mode, ``None`` in training mode.
    """
    def pre_activation(activations, layer):
        # Affine step: activations . W[layer] plus the bias tiled once per sample.
        return numpy.dot(activations, weights[layer]) + numpy.repeat(
            numpy.array([bias[layer]]), repeats=[numberOfSamples], axis=0)

    # first hidden layer
    h1In = pre_activation(input[:, :3], 0)
    h1Output = utils.relu(h1In)

    # second hidden layer
    h2In = pre_activation(h1Output, 1)
    h2Output = utils.relu(h2In)

    # output layer
    OIn = pre_activation(h2Output, 2)
    OOutput = utils.softmax(OIn)
    predictedClasses = numpy.argmax(OOutput, axis=1)

    # Keep the original ``== False`` comparison so values such as None still
    # take the inference branch, exactly as before.
    inferenceOnly = not (prediction == False)  # noqa: E712
    if inferenceOnly:
        return predictedClasses

    # Training step: record loss and accuracy for the graphs, then back-propagate.
    # The small constant is added to the softmax output before the log loss.
    errorForGraph = utils.log_loss(binarizedTruePrediction, OOutput[:] + 0.00001)
    errorForPlottingGraphList.append(errorForGraph)
    accuracyScoreForGraph.append(
        utils.accuracy_score(originalOutput, predictedClasses))
    backPropagation(input, OOutput, originalOutput, binarizedTruePrediction,
                    h1Output, h2Output, numberOfNeuronsInLayers, classes,
                    h1In, h2In, OIn, optimizer)
def main():
    """Train a perceptron on the digits data set and plot its test predictions.

    Features are normalised, labels are one-hot encoded for training, and
    predictions are decoded back to class indices with ``argmax`` before
    scoring and plotting.
    """
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # One-hot encoding of nominal y-values
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=8)

    # Perceptron
    clf = Perceptron(n_iterations=5000,
                     learning_rate=0.001,
                     loss=CrossEntropy,
                     activation_function=Sigmoid)
    clf.fit(X_train, y_train)

    # Decode one-hot rows back to class indices.
    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results.
    # The legend labels come from the original class ids: ``y`` is one-hot
    # here, so np.unique(y) would only ever yield the two values 0 and 1.
    Plot().plot_in_2d(X_test, y_pred, title="Perceptron", accuracy=accuracy,
                      legend_labels=np.unique(data.target))
def main():
    """Train a 100-estimator random forest on the digits data and plot predictions."""
    digits = datasets.load_digits()
    features = digits.data
    labels = digits.target

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.4, seed=2)
    print("X_train.shape:", X_train.shape)
    print("Y_train.shape:", y_train.shape)

    forest = RandomForest(n_estimators=100)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)

    score = accuracy_score(y_test, predicted)
    print("Accuracy:", score)

    # Project the test set to 2-D and colour it by predicted class.
    Plot().plot_in_2d(
        X_test,
        predicted,
        title="Random Forest",
        accuracy=score,
        legend_labels=digits.target_names,
    )
def main():
    """Train a multilayer perceptron on the digits data set and plot its predictions.

    Features are normalised, labels are one-hot encoded for training, and
    predictions are decoded back to class indices with ``argmax`` before
    scoring and plotting.
    """
    data = datasets.load_digits()
    X = normalize(data.data)
    y = data.target

    # convert the nominal y values to binary
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # mlp
    clf = MultilayerPerceptron(n_hidden=16, n_iterations=1000, learning_rate=0.01)
    clf.fit(X_train, y_train)

    # Decode one-hot rows back to class indices.
    y_pred = np.argmax(clf.predict(X_test), axis=1)
    y_test = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results.
    # The legend labels come from the original class ids: ``y`` is one-hot
    # here, so np.unique(y) would only ever yield the two values 0 and 1.
    Plot().plot_in_2d(X_test, y_pred, title="Multilayer Perceptron", accuracy=accuracy,
                      legend_labels=np.unique(data.target))
def main():
    """Train logistic regression on the two non-zero iris classes and plot predictions.

    Samples of class 0 are dropped and the remaining classes 1 and 2 are
    relabelled to 0 and 1, turning the task into a binary problem.
    """
    # Load dataset
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    # Order matters: relabel 1 -> 0 first, then 2 -> 1.
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, seed=1)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results.
    # Class 0 was removed above, so its name is dropped too. Passing all
    # three names would label the two remaining classes one position off.
    Plot().plot_in_2d(X_test, y_pred, title="Logistic Regression", accuracy=accuracy,
                      legend_labels=data.target_names[1:])
def main():
    """Train a classification tree on ``fishiris.csv`` and show the results.

    The label column is derived row-wise with ``create_target``. The split
    data, the fitted tree and the accuracy are printed, then the test
    predictions are plotted in 2-D and 3-D.
    """
    frame = pd.read_csv('fishiris.csv')
    frame['target'] = frame.apply(create_target, axis=1)
    y = frame['target'].to_numpy()

    # Everything except the name and the derived label is a feature.
    frame = frame.drop(['Name', 'target'], axis=1)
    feature_names = frame.columns.tolist()
    X = frame.to_numpy()
    target_names = ['setosa', 'versicolor', 'virginica']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=True)
    print('X_train\n', X_train)
    print('y_train\n', y_train)
    print('X_test\n', X_test)
    print('y_test\n', y_test)

    tree = ClassificationTree()
    tree.fit(X_train, y_train)
    predicted = tree.predict(X_test)

    rule = '-' * 40
    print(rule, 'print_tree', rule)
    tree.print_tree(feature_names=feature_names)
    print(rule, 'print_tree', rule)

    score = accuracy_score(y_test, predicted)
    print("Accuracy:", score)

    Plot().plot_in_2d(
        X_test,
        predicted,
        title="Decision Tree",
        accuracy=score,
        legend_labels=target_names,
    )
    Plot().plot_in_3d(X_test, predicted)
def ada_boost_testing(x_train, y_train, x_test, y_test, num_learner=50):
    """Fit AdaBoost and print its predictions and scores.

    Besides the scores, the raw training data and the raw predictions for
    both splits are printed.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        num_learner: First positional argument of ``AdaBoostClassifier``
            (presumably the number of weak learners -- confirm). Defaults to 50.
    """
    print('Ada Boost')
    print(x_train, y_train)
    aba = AdaBoostClassifier(num_learner)
    aba.fit(x_train, y_train)

    preds_train = aba.predict(x_train)
    preds_test = aba.predict(x_test)
    print(preds_train, preds_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    # Reuse the existing test predictions for F1 instead of predicting again.
    print('F1 Test {}'.format(f1(y_test, preds_test)))
def random_forest_various_seeds(x_train, y_train, x_test, y_test, best_max_features, best_n_trees):
    """Refit the tuned random forest under ten random seeds and tabulate the scores.

    A single classifier instance is reused: before each refit its ``seed``
    attribute is set to a fresh value drawn from ``np.random.randint(1, 1000)``.
    Per-seed train accuracy, test accuracy and test F1 are printed, and a
    table with one row per seed plus a final "Average" row is printed at
    the end.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        best_max_features: ``max_features`` value to evaluate.
        best_n_trees: ``n_trees`` value to evaluate.
    """
    # let the user know which test this is
    print("== Beginning test for best result with random seeds.\n")

    # to hold data points
    randseedTrain = []
    randseedTest = []
    randseedF1 = []
    usedSeeds = []

    rclf = RandomForestClassifier(max_depth=7, max_features=best_max_features, n_trees=best_n_trees)

    for _ in range(10):
        rclf.seed = np.random.randint(1, 1000)
        usedSeeds.append(rclf.seed)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Each metric is computed once; F1 reuses the test predictions
        # instead of running the forest over x_test again.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        f1_test = f1(y_test, preds_test)

        randseedTrain.append(train_accuracy)
        randseedTest.append(test_accuracy)
        randseedF1.append(f1_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(f1_test))

    # get averages
    averageTrain = sum(randseedTrain) / len(randseedTrain)
    averageTest = sum(randseedTest) / len(randseedTest)
    averageF1 = sum(randseedF1) / len(randseedF1)

    # get table for data + add averages at the end
    table3 = pd.DataFrame({
        "Seed": usedSeeds + ["Average"],
        "Train Accuracy": randseedTrain + [averageTrain],
        "Test Accuracy": randseedTest + [averageTest],
        "F1 Score": randseedF1 + [averageF1]
    })
    print(table3)
def random_forest_tune_MaxFeatures(x_train, y_train, x_test, y_test):
    """Sweep ``max_features`` for a random forest and return the best value.

    Each candidate is evaluated with ``max_depth=7`` and ``n_trees=50``.
    Scores are rounded to three decimals, printed, tabulated and plotted to
    ``random_forest_output_max_features.png``.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        The candidate with the highest rounded test F1 (the first one on ties).
    """
    print('Random Forest tune\n\n')
    plotX = [1, 2, 5, 8, 10, 20, 25, 35, 50]
    plotTrain = []
    plotTest = []
    plotF1 = []

    for max_features in plotX:
        print("MAX_Features: ", max_features)
        rclf = RandomForestClassifier(max_depth=7, max_features=max_features, n_trees=50)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        # Reuse the existing test predictions for F1 instead of predicting again.
        F1 = round(f1(y_test, preds_test), 3)

        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(F1))
        print('\n')

        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "MAX_Features": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Accuracy": plotF1
    })
    print(df)

    # list.index returns the first maximum, so ties go to the smaller candidate.
    maxAccuracy = max(plotF1)
    best_MAX_Features = plotX[plotF1.index(maxAccuracy)]
    print("The best MAX_Features is ", best_MAX_Features, "with F1 accuracy ", maxAccuracy)

    print("Drawing plot")
    plt.plot('MAX_Features', 'Train_Accuracy', data=df, color='red')
    plt.plot('MAX_Features', 'Test_Accuracy', data=df, color='blue')
    plt.plot('MAX_Features', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('random_forest_output_max_features.png')
    plt.close()

    return best_MAX_Features
def ada_boost_testing(x_train, y_train, x_test, y_test, num_learner):
    """Fit AdaBoost and report accuracy and F1 on both splits.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        num_learner: First positional argument of ``AdaBoostClassifier``
            (presumably the number of weak learners -- confirm).

    Returns:
        A tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print('Ada Boost and L(', num_learner, ')')
    aba = AdaBoostClassifier(num_learner)
    aba.fit(x_train, y_train)

    # Predict each split exactly once; the original predicted both twice
    # and evaluated each F1 score twice.
    preds_train = aba.predict(x_train)
    preds_test = aba.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))

    return train_accuracy, test_accuracy, f1_train, f1_test
def adaBoost_testing(x_train, y_train, x_test, y_test, l):
    """Fit AdaBoost and return its scores rounded to three decimals.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        l: First positional argument of ``AdaBoostClassifier`` (presumably
            the number of weak learners -- confirm).

    Returns:
        A tuple ``(train_accuracy, test_accuracy, test_f1)``, each rounded
        to three decimals.
    """
    clf = AdaBoostClassifier(l)
    clf.fit(x_train, y_train)

    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)

    train_accuracy = round(accuracy_score(preds_train, y_train), 3)
    test_accuracy = round(accuracy_score(preds_test, y_test), 3)
    # Reuse the existing test predictions for F1 instead of predicting again.
    F1 = round(f1(y_test, preds_test), 3)

    print("L: ", l)
    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Test {}'.format(F1))
    print()

    return train_accuracy, test_accuracy, F1
def test_nb_using_iris(self):
    """Fit naive Bayes on iris and require training accuracy above 0.9."""
    iris = load_iris()
    features = iris['data']
    labels = iris['target']

    classifier = NaiveBayes()
    classifier.fit(features, labels)

    training_accuracy = accuracy_score(classifier.predict(features), labels)
    assert training_accuracy > 0.9
def test_id3(self):
    """Fit ID3 on the ``load_loans()`` data and require training accuracy above 0.99."""
    tree = ID3(max_depth=100, max_leafs=100, epsilon=0.00001)
    samples, targets, feature_names = load_loans()

    # The tree is told the feature names before it is fitted.
    tree.set_feature_labels(feature_names)
    tree.fit(samples, targets)

    predicted = tree.predict(samples)
    assert accuracy_score(predicted, targets) > 0.99
def random_forest_various_trees(x_train, y_train, x_test, y_test):
    """Sweep ``n_trees`` from 10 to 200 (step 10) and return the best value.

    Each candidate is evaluated with ``max_depth=7`` and ``max_features=11``.
    Train accuracy, test accuracy and test F1 are printed, tabulated and
    plotted on figure 2 into ``q2pb.png``.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        The ``n_trees`` candidate with the highest test F1 (the first one on ties).
    """
    # One candidate list shared by the loop, the table and the final lookup.
    treeCounts = list(range(10, 210, 10))
    graphTrain = []
    graphTest = []
    graphF1 = []

    # let the user know which test this is
    print("== Beginning test for various n_trees.\n")

    # plot accuracies for the number of trees specified in part b
    for i in treeCounts:
        print("n_trees: ", i)
        rclf = RandomForestClassifier(max_depth=7, max_features=11, n_trees=i)
        rclf.fit(x_train, y_train)
        preds_train = rclf.predict(x_train)
        preds_test = rclf.predict(x_test)

        # Each metric is computed once; F1 reuses the test predictions
        # instead of running the forest over x_test again.
        train_accuracy = accuracy_score(preds_train, y_train)
        test_accuracy = accuracy_score(preds_test, y_test)
        f1_test = f1(y_test, preds_test)

        graphTrain.append(train_accuracy)
        graphTest.append(test_accuracy)
        graphF1.append(f1_test)
        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}\n'.format(f1_test))

    # table for easily reading data
    table = pd.DataFrame({
        "n_trees": treeCounts,
        "Train Accuracy": graphTrain,
        "Test Accuracy": graphTest,
        "F1 Accuracy": graphF1
    })
    print(table)

    # plot our graph and output to a file
    plt.figure(2)
    plt.xlabel('Number of trees')
    plt.ylabel('Performance')
    plt.title('Accuracy & F1 Score vs Number of Trees in the Forest')
    plt.plot('n_trees', 'Train Accuracy', data=table, color='blue')
    plt.plot('n_trees', 'Test Accuracy', data=table, color='green')
    plt.plot('n_trees', 'F1 Accuracy', data=table, color='red')
    plt.legend()
    plt.savefig('q2pb.png')

    # return our best n_trees value for use in main
    return treeCounts[graphF1.index(max(graphF1))]
def random_forest_testing(x_train, y_train, x_test, y_test, n_trees, max_features):
    """Fit a depth-7 random forest and report accuracy and F1 on both splits.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        n_trees: First positional argument of ``RandomForestClassifier``.
        max_features: Second positional argument of ``RandomForestClassifier``.

    Returns:
        A tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    # Single definition of the depth so the banner and the model cannot disagree.
    max_depth = 7

    print('Random Forest')
    print("max_depth: %d, max_features: %d, n_trees: %d" % (max_depth, max_features, n_trees))
    rclf = RandomForestClassifier(n_trees, max_features, max_depth=max_depth)
    rclf.fit(x_train, y_train)

    # Predict each split exactly once; the original predicted both twice
    # and evaluated each F1 score twice.
    preds_train = rclf.predict(x_train)
    preds_test = rclf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))

    return train_accuracy, test_accuracy, f1_train, f1_test
def decision_tree_tune(x_train, y_train, x_test, y_test):
    """Sweep decision-tree depth 1..25 and return the best depth.

    Scores are rounded to three decimals, printed, tabulated and plotted to
    ``decision_tree_output.png``.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        The depth with the highest rounded test F1 (the first one on ties).
    """
    print('Decision Tree tune\n\n')
    plotX = list(range(1, 26))
    plotTrain = []
    plotTest = []
    plotF1 = []

    for depth in plotX:
        # Label fixed: the original printed "Math Depth".
        print('Max Depth: ', depth)
        clf = DecisionTreeClassifier(max_depth=depth)
        clf.fit(x_train, y_train)
        preds_train = clf.predict(x_train)
        preds_test = clf.predict(x_test)

        train_accuracy = round(accuracy_score(preds_train, y_train), 3)
        test_accuracy = round(accuracy_score(preds_test, y_test), 3)
        # Reuse the existing test predictions for F1 instead of predicting again.
        F1 = round(f1(y_test, preds_test), 3)

        print('Train {}'.format(train_accuracy))
        print('Test {}'.format(test_accuracy))
        print('F1 Test {}'.format(F1))
        print('\n')

        plotTrain.append(train_accuracy)
        plotTest.append(test_accuracy)
        plotF1.append(F1)

    df = pd.DataFrame({
        "Max_Depth": plotX,
        "Train_Accuracy": plotTrain,
        "Test_Accuracy": plotTest,
        "F1_Accuracy": plotF1
    })
    print(df)

    # list.index returns the first maximum, so ties go to the shallower tree.
    maxAccuracy = max(plotF1)
    bestDepth = plotX[plotF1.index(maxAccuracy)]
    print("The best Depth is ", bestDepth, "with F1 accuracy ", maxAccuracy)

    print("Drawing plot")
    plt.plot('Max_Depth', 'Train_Accuracy', data=df, color='red')
    plt.plot('Max_Depth', 'Test_Accuracy', data=df, color='blue')
    plt.plot('Max_Depth', 'F1_Accuracy', data=df, color='black')
    plt.legend()
    plt.savefig('decision_tree_output.png')
    plt.close()

    return bestDepth
def decision_tree_testing(x_train, y_train, x_test, y_test, max_depth):
    """Fit a decision tree of the given depth and report accuracy and F1 on both splits.

    Args:
        x_train: Training features handed to ``fit`` / ``predict``.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        max_depth: First positional argument of ``DecisionTreeClassifier``;
            also printed with ``%d``, so it must be a number.

    Returns:
        A tuple ``(train_accuracy, test_accuracy, train_f1, test_f1)``.
    """
    print('Decision Tree')
    print("depth : %d" % max_depth)
    clf = DecisionTreeClassifier(max_depth)
    clf.fit(x_train, y_train)

    # Predict each split exactly once; the original predicted both twice
    # and evaluated each F1 score twice.
    preds_train = clf.predict(x_train)
    preds_test = clf.predict(x_test)

    train_accuracy = accuracy_score(preds_train, y_train)
    test_accuracy = accuracy_score(preds_test, y_test)
    f1_train = f1(y_train, preds_train)
    f1_test = f1(y_test, preds_test)

    print('Train {}'.format(train_accuracy))
    print('Test {}'.format(test_accuracy))
    print('F1 Train {}'.format(f1_train))
    print('F1 Test {}\n'.format(f1_test))

    return train_accuracy, test_accuracy, f1_train, f1_test
def main():
    """Train naive Bayes on the normalised digits data and plot its test predictions."""
    digits = datasets.load_digits()
    features = normalize(digits.data)
    labels = digits.target

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4)
    print("X_train", X_train.shape)

    model = NaiveBayes()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    score = accuracy_score(y_test, predicted)
    print("Accuracy:", score)

    # Project the test set to 2-D and colour it by predicted class.
    Plot().plot_in_2d(
        X_test,
        predicted,
        title="Naive Bayes",
        accuracy=score,
        legend_labels=digits.target_names,
    )
def main():
    """Train logistic regression on the two non-zero iris classes and plot predictions.

    Samples of class 0 are dropped and the remaining classes 1 and 2 are
    relabelled to 0 and 1.
    """
    iris = datasets.load_iris()

    # Keep only the samples whose class is not 0.
    features = normalize(iris.data[iris.target != 0])
    labels = iris.target[iris.target != 0]
    # Order matters: relabel 1 -> 0 first, then 2 -> 1.
    labels[labels == 1] = 0
    labels[labels == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.33, seed=1)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    # Bring the predictions into the same shape as the true labels.
    predicted = np.reshape(model.predict(X_test), y_test.shape)

    score = accuracy_score(y_test, predicted)
    print("Accuracy:", score)

    # Project the test set to 2-D and colour it by predicted class.
    Plot().plot_in_2d(X_test, predicted, title="Logistic Regression", accuracy=score)
def main():
    """Train a classification tree on iris and plot its test predictions."""
    print("-- Classification Tree --")

    iris = datasets.load_iris()
    features = iris.data
    labels = iris.target

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4)

    tree = ClassificationTree()
    tree.fit(X_train, y_train)
    predicted = tree.predict(X_test)

    score = accuracy_score(y_test, predicted)
    print("Accuracy:", score)

    # Project the test set to 2-D and colour it by predicted class.
    Plot().plot_in_2d(
        X_test,
        predicted,
        title="Decision Tree",
        accuracy=score,
        legend_labels=iris.target_names,
    )
def main():
    """Train the gradient-boosting classifier on iris and plot its test predictions."""
    print("-- Gradient Boosting Classification --")

    iris = datasets.load_iris()
    features = iris.data
    labels = iris.target

    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4)
    print(y_train)

    booster = GBDTClassifier()
    booster.fit(X_train, y_train)
    predicted = booster.predict(X_test)

    score = accuracy_score(y_test, predicted)
    print("Accuracy:", score)

    # Project the test set to 2-D and colour it by predicted class.
    Plot().plot_in_2d(
        X_test,
        predicted,
        title="Gradient Boosting",
        accuracy=score,
        legend_labels=iris.target_names,
    )
def run(args):
    """Build, optionally train, and visualise a recurrent glimpse-based MNIST classifier.

    The graph unrolls an LSTM for ``args.num_time_steps`` steps. At every
    step a location is sampled around a mean predicted from the previous
    hidden state, three glimpses are taken at that location, and the glimpse
    and location encodings are combined into the LSTM input. The final
    hidden state is classified into 10 classes.

    The total loss is the sum of the softmax cross-entropy, a
    log-likelihood-weighted reward term (``reinforce_loss``) and a squared
    error that fits the scalar baseline ``b`` to the mean reward.

    When ``args.train == 1`` the model is trained for ``args.num_epochs``
    epochs with per-epoch validation, then saved to ``args.checkpoint``.
    If ``args.checkpoint`` is non-empty it is restored, and finally the
    sampled locations for one training batch are plotted.

    Args:
        args: Namespace read for ``batch_size``, ``num_epochs``,
            ``num_time_steps``, ``num_lstm_units``, ``alpha``,
            ``location_sigma``, ``image_size`` (a "ROWSxCOLS" string),
            ``baseline``, ``optimizer``, ``learning_rate``, ``momentum``,
            ``logdir``, ``train`` and ``checkpoint``.
    """
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    num_steps = args.num_time_steps
    num_classes = 10
    num_lstm_units = args.num_lstm_units
    num_lstm_layer = 1  # only referenced by the commented-out MultiRNNCell line below
    alpha = args.alpha
    location_sigma = args.location_sigma
    glimpse_size = (12, 12)

    # image_size is given as "<rows>x<cols>"
    image_rows, image_cols = [int(v) for v in args.image_size.split("x")]

    mnist = input_data.read_data_sets("data", one_hot=True)

    sess = tf.Session()
    K.set_session(sess)

    image = tf.placeholder(tf.float32, (None, image_rows, image_cols, 1))
    label = tf.placeholder(tf.int32, (None, num_classes))

    tf.image_summary("translated mnist", image, max_images=3)

    cell = tf.nn.rnn_cell.LSTMCell(num_lstm_units, forget_bias=1., use_peepholes=True, state_is_tuple=True)
    # cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * num_lstm_layer, state_is_tuple=True)
    # The zero state is sized from the fed image batch; its c/h tensors are
    # also used as feed targets further down.
    state = initial_state = cell.zero_state(tf.shape(image)[0], dtype=tf.float32)

    # Layers are created once here and re-applied at every time step.
    location_net = Dense(2, activation="linear", name="location_net")
    h_g = Dense(128, activation="relu", name="h_g")
    h_l = Dense(128, activation="relu", name="h_l")
    linear_h_g = Dense(256, activation="linear", name="linear_h_g")
    linear_h_l = Dense(256, activation="linear", name="linear_h_l")

    locations = []  # sampled location tensor per time step
    loc_means = []  # predicted location mean per time step
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                # Share the variables across unrolled steps.
                tf.get_variable_scope().reuse_variables()

            h_tm1 = state.h

            loc_mean = location_net(h_tm1)
            tf.histogram_summary("loc_mean(t=%d) without tanh" % time_step, loc_mean)
            # loc_mean = 1.7159 * tf.nn.tanh(2/3 * loc_mean)
            # tf.histogram_summary("loc_mean(t=%d)" % time_step, loc_mean)
            # Sample around the mean; stop_gradient keeps the cross-entropy
            # from back-propagating through the sample.
            # NOTE(review): the sample shape is fixed to batch_size, so
            # presumably every fed batch must contain exactly batch_size
            # images -- confirm.
            locations += [tf.stop_gradient(tf.random_normal((batch_size, 2), loc_mean, location_sigma))]
            loc_means += [loc_mean]

            # Three glimpse sizes: 1x, 2x and 3x the base glimpse_size.
            sizes = [(glimpse_size[0] * (i + 1), glimpse_size[1] * (i + 1)) for i in range(3)]
            glimpses = take_glimpses(image, locations[-1], sizes)
            glimpse = tf.concat(3, glimpses)
            glimpse = tf.reshape(glimpse, (-1, np.prod(glimpse_size) * len(sizes)))

            _h_g = h_g(glimpse)
            _h_l = h_l(locations[-1])
            inputs = tf.nn.relu(linear_h_g(_h_g) +
                                linear_h_l(_h_l))
            (cell_output, state) = cell(inputs, state)
            tf.image_summary("12x12 glimpse t=%d" % time_step, glimpses[-1], max_images=5)

    # Classify from the hidden state after the last time step.
    logits = Dense(num_classes, name="logits")(state.h)
    inference = tf.nn.softmax(logits)
    prediction = tf.arg_max(inference, 1)
    # Reward: 1.0 where the predicted class equals the label's arg-max, else 0.0.
    R = tf.cast(tf.equal(prediction, tf.arg_max(label, 1)), tf.float32)
    R = tf.stop_gradient(tf.expand_dims(R, 1))
    accuracy = tf.reduce_mean(R)
    tf.scalar_summary("accuracy", accuracy)

    loss = tf.nn.softmax_cross_entropy_with_logits(logits, tf.cast(label, tf.float32))
    loss = tf.reduce_mean(loss)
    tf.scalar_summary("xentropy", loss)

    b = K.variable(0., name="baseline")
    tf.scalar_summary("baseline", b)

    reinforce_loss = 0.
    for time_step, (l, l_mean) in enumerate(zip(locations, loc_means)):
        b_val = 0.
        if args.baseline:
            # The baseline is subtracted from the reward but only trained
            # through baseline_loss below.
            b_val = tf.stop_gradient(b)

        # Gaussian density of the sampled location under its predicted mean.
        p = 1. / tf.sqrt(2 * np.pi * tf.square(location_sigma))
        p *= tf.exp(-tf.square(l - l_mean) / (2 * tf.square(location_sigma)))
        # K.epsilon() guards the log against a zero density.
        reinforce_loss -= alpha * (R - b_val) * tf.log(p + K.epsilon())

    baseline_loss = tf.squared_difference(tf.reduce_mean(R), b)
    tf.scalar_summary("loss:baseline", baseline_loss)

    # Mean over axis 0, then summed over the remaining axis.
    reinforce_loss = tf.reduce_sum(tf.reduce_mean(reinforce_loss, reduction_indices=0))
    tf.scalar_summary("loss:reinforce", reinforce_loss)

    total_loss = loss + reinforce_loss + baseline_loss
    tf.scalar_summary("loss:total", total_loss)

    # NOTE(review): any optimizer name other than "adam" / "momentum" leaves
    # `optimizer` unbound and fails with a NameError at apply_gradients below.
    if str.lower(args.optimizer) == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    elif str.lower(args.optimizer) == "momentum":
        optimizer = tf.train.MomentumOptimizer(learning_rate=args.learning_rate, momentum=args.momentum)

    tvars = tf.trainable_variables()
    grads = tf.gradients(total_loss, tvars)
    for tvar, grad in zip(tvars, grads):
        tf.histogram_summary(tvar.name, grad)
    train_step = optimizer.apply_gradients(zip(grads, tvars))

    merged = tf.merge_all_summaries()
    summary_writer = tf.train.SummaryWriter(args.logdir, sess.graph)

    # Training
    sess.run(tf.initialize_all_variables())
    # Evaluate the zero state once for a batch_size-sized batch; these arrays
    # are fed back in as the initial LSTM state on every later run.
    initial_c, initial_h = sess.run([initial_state.c,
                                     initial_state.h],
                                    feed_dict={image: np.zeros((batch_size, image_rows, image_cols, 1))})

    saver = tf.train.Saver()
    if args.train == 1:
        epoch_loss = []
        epoch_reinforce_loss = []
        epoch_acc = []

        global_step = 0
        while mnist.train.epochs_completed < num_epochs:
            current_epoch = mnist.train.epochs_completed
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            batch_x = translate(batch_x.reshape((-1, 28, 28, 1)), size=(image_rows, image_cols))

            # From here on `loss` holds the fetched total_loss value, not the
            # cross-entropy tensor; the graph has already been built.
            preds, loss, r_loss, summary, _ = sess.run([prediction, total_loss, reinforce_loss, merged, train_step],
                                                       feed_dict={image: batch_x, label: batch_y,
                                                                  initial_state.c: initial_c, initial_state.h: initial_h,
                                                                  K.learning_phase(): 1})
            epoch_loss += [loss]
            epoch_reinforce_loss += [r_loss]
            epoch_acc += [accuracy_score(preds, np.argmax(batch_y, axis=1))]

            summary_writer.add_summary(summary, global_step)
            global_step += 1

            # next_batch advanced the epoch counter: report and validate.
            if mnist.train.epochs_completed != current_epoch:
                print("[Epoch %d/%d]" % (current_epoch + 1, num_epochs))
                print("loss:", np.asarray(epoch_loss).mean())
                print("reinforce_loss: %.5f+/-%.5f" % (
                    np.asarray(epoch_reinforce_loss).mean(), np.asarray(epoch_reinforce_loss).std()))
                print("acc: ", np.asarray(epoch_acc).mean())

                epoch_acc = []
                epoch_loss = []
                epoch_reinforce_loss = []

                val_loss = []
                val_reinforce_loss = []
                val_acc = []
                # One full pass over the validation set.
                while mnist.validation.epochs_completed != 1:
                    batch_x, batch_y = mnist.validation.next_batch(batch_size)
                    batch_x = translate(batch_x.reshape((-1, 28, 28, 1)), size=(image_rows, image_cols))
                    # The location tensors are fetched as well so the glimpses can be plotted.
                    res = sess.run([prediction, total_loss, reinforce_loss] + locations,
                                   feed_dict={image: batch_x.reshape((-1, image_rows, image_cols, 1)),
                                              label: batch_y,
                                              initial_state.c: initial_c, initial_state.h: initial_h,
                                              K.learning_phase(): 0})
                    preds, loss, r_loss = res[:3]
                    locs = res[3:]
                    val_loss += [loss]
                    val_reinforce_loss += [r_loss]
                    val_acc += [accuracy_score(preds, np.argmax(batch_y, axis=1))]

                    images = batch_x.reshape((-1, image_rows, image_cols))
                    locs = np.asarray(locs, dtype=np.float32)
                    # presumably maps locations from [-1, 1] to pixel coordinates -- confirm
                    locs = (locs + 1) * (image_rows / 2)
                    plot_glimpse(images, locs, name=args.logdir + "/glimpse.png")
                # Reset the dataset's private counters so the next epoch's
                # validation loop starts from the beginning again.
                mnist.validation._epochs_completed = 0
                mnist.validation._index_in_epoch = 0

                print("Val loss:", np.asarray(val_loss).mean())
                print("Val reinforce_loss: %.5f+/-%.5f" % (
                    np.asarray(val_reinforce_loss).mean(), np.asarray(val_reinforce_loss).std()))
                print("Val acc: ", np.asarray(val_acc).mean())

        saver.save(sess, args.checkpoint)

    if len(args.checkpoint) > 0:
        saver.restore(sess, args.checkpoint)

    # plot results
    batch_x, _ = mnist.train.next_batch(batch_size)
    batch_x = translate(batch_x.reshape((-1, 28, 28, 1)), size=(image_rows, image_cols))

    locs = sess.run(locations, feed_dict={image: batch_x.reshape((-1, image_rows, image_cols, 1)),
                                          initial_state.c: initial_c, initial_state.h: initial_h,
                                          K.learning_phase(): 0})

    images = batch_x.reshape((-1, image_rows, image_cols))
    locs = np.asarray(locs, dtype=np.float32)
    # same rescaling as in the validation loop above
    locs = (locs + 1) * (image_rows / 2)
    plot_glimpse(images, locs)
def acc(self, y, p):
    """Return the accuracy between the row-wise arg-max of ``y`` and of ``p``.

    Args:
        y: 2-D array of true scores or one-hot labels, one row per sample.
        p: 2-D array of predicted scores, one row per sample.

    Returns:
        Whatever ``accuracy_score`` returns for the two index vectors.
    """
    true_classes = np.argmax(y, axis=1)
    predicted_classes = np.argmax(p, axis=1)
    return accuracy_score(true_classes, predicted_classes)