def main():
    """Run the KNN experiment: training, leave-one-out and test results per K.

    Loads the data through ``knn.load_data()``, divides every feature row by
    its row sum, appends the labels as extra column(s), and evaluates
    ``knn.accuracy`` for every value produced by ``knn.gen_k_values()``.
    Results are printed and, when the module-level ``PLOT`` flag is truthy,
    plotted against K.
    """
    X_train, Y_train, X_test, Y_test = knn.load_data()
    print("X_train[0]", X_train[0])
    print(X_test)

    # Normalize: divide each row by the sum of its own features.
    new_dim = np.newaxis
    X_train = X_train / X_train.sum(axis=1)[:, new_dim]
    X_test = X_test / X_test.sum(axis=1)[:, new_dim]
    print("X_train after normalize: ", X_train[0])

    # Re-attach the labels so each row carries features followed by label.
    train_normal = np.append(X_train, Y_train, axis=1)
    print("train[0]: ", train_normal[0])
    test_normal = np.append(X_test, Y_test, axis=1)
    print("test[0]: ", test_normal[0])

    K = knn.gen_k_values()

    # Evaluated once and reused below for printing and plotting.
    train_error = knn.accuracy(train_normal, train_normal, K)
    print(train_error)

    # Leave-one-out pass over the TRAINING set.  The result container and the
    # divisor are both derived from the training set size, so the loop can no
    # longer index past a container that was sized from the test set.
    n_train = len(train_normal)
    temp = [
        # np.delete returns a copy, so train_normal itself is never modified
        # while it is being iterated.
        knn.accuracy(np.delete(train_normal, i, 0), example, K, leave_out=True)
        for i, example in enumerate(train_normal)
    ]
    # NOTE(review): the divisor keeps the original "- 1"; confirm against what
    # knn.accuracy returns for leave_out=True whether n_train is intended.
    cross_error = [
        np.float32(total) / (n_train - 1) for total in np.sum(temp, axis=0)
    ]
    print("!!!!!!Cross results", cross_error)

    test_error = knn.accuracy(train_normal, test_normal, K)
    print("!!!!!!training results", train_error)
    print("!!!!!!testing results", test_error)
    print(len(train_error), len(test_error))

    if PLOT:
        plt.plot(K, train_error)
        plt.plot(K, test_error)
        print("len cross: ", len(cross_error))
        plt.plot(K, cross_error)
        plt.xlabel('K values')
        plt.ylabel('Error')
        plt.show()
def knn_get_both_accuracies(k, xTrain, yTrain, xTest, yTest):
    """Fit a ``knn.Knn`` model and score it on both the training and test sets.

    ``k`` is converted with ``int()`` before the model is built.  The label
    containers are indexed with ``'label'``.

    Returns
    -------
    tuple
        ``(trainAcc, testAcc)`` as returned by ``knn.accuracy``.
    """
    classifier = knn.Knn(int(k))
    classifier.train(xTrain, yTrain['label'])

    # Score on the data the model was fitted on.
    train_predictions = classifier.predict(xTrain)
    train_score = knn.accuracy(train_predictions, yTrain['label'])

    # Score on the held-out data.
    test_predictions = classifier.predict(xTest)
    test_score = knn.accuracy(test_predictions, yTest['label'])

    return train_score, test_score
def main():
    """Command-line entry point: sweep odd K from 1 to 19 and plot accuracies.

    Reads the four CSV files named on the command line, fits a ``knn.Knn``
    model for each K, prints a table of train/test accuracy, and saves a line
    plot of that table to ``q3d.png``.
    """
    # Command-line options: (flag, default file, help text).
    cli_options = (
        ("--xTrain", "q3xTrain.csv",
         "filename for features of the training data"),
        ("--yTrain", "q3yTrain.csv",
         "filename for labels associated with training data"),
        ("--xTest", "q3xTest.csv",
         "filename for features of the test data"),
        ("--yTest", "q3yTest.csv",
         "filename for labels associated with the test data"),
    )
    parser = argparse.ArgumentParser()
    for flag, default_file, description in cli_options:
        parser.add_argument(flag, default=default_file, help=description)
    args = parser.parse_args()

    # Load the train and test data.
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    yTest = pd.read_csv(args.yTest)

    # One row per K: [k, train accuracy, test accuracy].
    rows = []
    for neighbours in range(1, 20, 2):
        classifier = knn.Knn(neighbours)
        classifier.train(xTrain, yTrain['label'])
        train_score = knn.accuracy(classifier.predict(xTrain), yTrain['label'])
        test_score = knn.accuracy(classifier.predict(xTest), yTest['label'])
        rows.append([neighbours, train_score, test_score])

    perfDF = pd.DataFrame(rows, columns=["k", "train", "test"])
    print(perfDF)

    # Plot both accuracy curves against K and write the figure to disk.
    perfDF = perfDF.set_index("k")
    sns.set(style="whitegrid")
    axes = sns.lineplot(data=perfDF, palette="tab10", linewidth=2.5)
    axes.get_figure().savefig("q3d.png")
def knn_train_test(k, xTrain, yTrain, xTest, yTest):
    """
    Train a knn model with the given k and score it on the test data.

    Parameters
    ----------
    k : int
        The number of neighbors, passed to ``knn.Knn``.
    xTrain : nd-array with shape n x d
        Training data
    yTrain : container indexed with ``'label'``
        Labels associated with the training data.
    xTest : nd-array with shape m x d
        Test data
    yTest : container indexed with ``'label'``
        Labels associated with the test data.

    Returns
    -------
    acc
        Whatever ``knn.accuracy`` returns for the test predictions.
    """
    classifier = knn.Knn(k)
    classifier.train(xTrain, yTrain['label'])

    # Only the test set is scored here.
    test_predictions = classifier.predict(xTest)
    test_score = knn.accuracy(test_predictions, yTest['label'])
    return test_score
def test_accuracy():
    """Check that ``knn.accuracy`` yields '60.0%' on the module fixtures for k = 3."""
    neighbours = 3
    # Fixtures exposed by the knn module, in the order accuracy() takes them.
    fixtures = (knn.testing, knn.df_testing, knn.df_data)
    outcome = knn.accuracy(*fixtures, neighbours)
    assert outcome == '60.0%', "Accuracy calculated wrong"
def cross_validate(features, labels):
    """Average the value of ``accuracy`` over 10 interleaved folds.

    Fold ``f`` holds out every 10th sample starting at index ``f``; a model is
    built with ``learn_model(1, ...)`` on the remaining samples and evaluated
    on the held-out ones.  The ten results are summed and divided by 10.
    """
    positions = np.arange(len(features))
    running_total = 0.0
    for held_out_fold in range(10):
        # Samples whose index is congruent to the fold number are held out.
        testing = (positions % 10) == held_out_fold
        training = ~testing
        fitted = learn_model(1, features[training], labels[training])
        running_total += accuracy(features[testing], labels[testing], fitted)
    return running_total / 10.0
def cross_validation(features, labels):
    """Return the mean of ``accuracy`` across 10 interleaved hold-out folds.

    For each fold, every 10th sample (offset by the fold number) is used for
    evaluation and the rest for fitting ``learn_model(1, ...)``.
    """
    sample_count = len(features)
    accumulated = 0.0
    for offset in range(10):
        # Start with nothing held out, then flag this fold's samples.
        testing = np.zeros(sample_count, bool)
        testing[offset::10] = True
        training = ~testing

        fold_model = learn_model(1, features[training], labels[training])
        fold_result = accuracy(features[testing], labels[testing], fold_model)
        accumulated += fold_result
    return accumulated / 10.0
def oneFold():
    """Run a single hold-out evaluation and print the resulting accuracy.

    Uses the module-level ``features`` and ``labels``: every 4th sample
    starting at index 1 is held out for testing, a model is fitted with
    ``fit_model`` (k = 1) on the rest, and the value of ``accuracy`` on the
    held-out samples is printed as a percentage.
    """
    # masks for training and testing
    training = np.ones(len(features), bool)
    # hold out every 4th sample, starting at index 1
    training[1::4] = 0
    testing = ~training

    k = 1
    model = fit_model(k, features[training], labels[training])
    accr = accuracy(model, features[testing], labels[testing])
    # Call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2 as the old print statement did.
    print('Aprox Accuracy was{0:.1%}'.format(accr))
def cross_validate(features, labels):
    """Mean of ``accuracy`` over 10 interleaved folds of a k = 1 model.

    Fold ``f`` evaluates on samples ``f, f + 10, f + 20, ...`` and fits
    ``fit_model`` on all the others.
    """
    neighbours = 1
    fold_count = 10
    sample_count = len(features)

    total = 0.0
    for fold in range(fold_count):
        # Flag this fold's samples as held out; everything else is fitted on.
        held_out = np.zeros(sample_count, bool)
        held_out[fold::fold_count] = True
        kept = np.logical_not(held_out)

        fitted = fit_model(neighbours, features[kept], labels[kept])
        total += accuracy(fitted, features[held_out], labels[held_out])
    return total / fold_count
def cross_validate(train_data, train_labels, k, distance, F=5, prints=True):
    """
    Run F rounds of cross validation on the training set.

    The first ``F * (n // F)`` rows are cut into F contiguous validation
    slices of equal length; in each round one slice is classified by
    ``knn.classify`` using all remaining rows as training data.  With
    ``prints`` enabled, the confusion matrix of every round is printed.

    Returns an array of length F with the value of ``knn.accuracy`` for the
    confusion matrix of each round.
    """
    total_rows = train_data.shape[0]
    # rows per validation slice
    slice_len = total_rows // F
    all_rows = np.arange(0, total_rows)

    round_scores = np.zeros(F)
    for round_no in range(F):
        start = round_no * slice_len
        stop = (round_no + 1) * slice_len
        validation_index = np.arange(start, stop)
        # every row that is not part of this round's validation slice
        train_index = np.setdiff1d(all_rows, validation_index)

        predicted_labels = knn.classify(
            train_data[train_index],
            train_labels[train_index],
            train_data[validation_index],
            k,
            distance,
        )
        con_matrix = knn.confusion_matrix(
            train_labels[validation_index], predicted_labels
        )

        # Printing is switched off by callers that sweep many values of k.
        if prints:
            class_names = ['1', '2', '7']
            con_mat_df = pd.DataFrame(
                con_matrix, index=class_names, columns=list(class_names)
            )
            print('Cross-validation round', round_no + 1)
            print(
                'Confusion matrix: Predicted classes along horizontal axis. Actual classes along vertical axis.'
            )
            print(con_mat_df)

        round_scores[round_no] = knn.accuracy(con_matrix)
    return round_scores
f.close() elif model == 'nearest': file = open(input_file, "r") if train_or_test == 'train': train_data,train_names=knn.read_file(file) f = open(model_file, 'wb') pickle.dump(train_data, f, protocol=pickle.HIGHEST_PROTOCOL) f.close() else: f = open(model_file, 'rb') train_data = pickle.load(f) test_data,test_names=knn.read_file(file) knn.accuracy(knn.knn(knn.euclidean(train_data,test_data), train_data), test_names) f.close() elif model == 'forest': if train_or_test == 'train': forest.train(input_file, model_file) else: forest.test_forest(input_file, model_file)
Xtest,yTest,XtestID = myBoost.getDataFromFile(train_test_file) finalPredictions = myBoost.predict(Xtest) myBoost.writeToFile(XtestID,finalPredictions,'output.txt') print("Accuracy is: " ,sum(finalPredictions==yTest)/len(yTest)) else: print("Untrained model being tested") #train train-data.txt knn_model.txt knn #test test-data.txt knn_model.txt knn if model == 'knn' : if trainOrTest == 'train': knn.train(train_test_file,model_file) if trainOrTest == 'test': try: myKnn = open(model_file,'rb') except: print("output file has not been generated") finalPredictions,yTest,XtestID= knn.test(48,model_file ,train_test_file) knn.writeToFile(XtestID,finalPredictions,'output.txt') print("Accuracy is: " ,knn.accuracy(finalPredictions,yTest))
'''
This is a script to run the classifier
'''
from knn import KNN
from knn import accuracy

# Data set that is both classified and used to score the predictions.
TEST_DATA_PATH = './data/bezdekIris.data'

print("Iris flower predictions based on KNN:")
model = KNN('./data/iris.data')
predictions = model.run_classifier(TEST_DATA_PATH)
print("Predictions for test data set:", predictions)

# Keep the score under its own name so the imported accuracy() function is
# not overwritten by its own result.
model_accuracy = accuracy(predictions, TEST_DATA_PATH)
print("model accuracy:", model_accuracy)
def totalAcuuracyGaussian1():
    """Average ``knn.accuracy`` of the Gaussian classification over 10 folds.

    For fold ``i`` the output of ``totalGaussianClassification(folds[i])`` is
    scored against ``data[classes[i]]``; each score contributes one tenth to
    the returned total.
    """
    running_mean = 0.0
    for fold_no in range(10):
        fold_predictions = totalGaussianClassification(folds[fold_no])
        fold_truth = data[classes[fold_no]]
        fold_score = knn.accuracy(fold_predictions, fold_truth)
        running_mean += fold_score / 10.0
    return running_mean
def k_nearest_neighbor_model(algo_type):
    """Streamlit page: pick a dataset and split, train KNN, show the results.

    The sidebar collects the dataset choice and the test split percentage.
    Once a valid dataset is loaded through ``LOAD``, a K slider and a "Train"
    button are shown; pressing the button runs ``KNN.predict_knn``, reports the
    testing accuracy and draws the plots.

    Parameters
    ----------
    algo_type
        Passed through unchanged to ``plot``.

    Notes
    -----
    ``top`` and ``middle`` are not defined in this function; they are
    presumably module-level Streamlit containers -- confirm.
    """
    options = [
        "Iris Flower Classification",
        "Wine Quality Classification",
        "Heart Disease Chances Prediction",
        "Other",
    ]
    # Bundled CSV file for each built-in dataset choice.
    internal_files = {
        "Iris Flower Classification": "Iris.csv",
        "Wine Quality Classification": "wine.csv",
        "Heart Disease Chances Prediction": "heart.csv",
    }

    ds = st.sidebar.selectbox(
        "Choose Dataset (Choose Other to Upload External File)", options)
    split = st.sidebar.text_input(
        "Enter test split percentage between 1 and 100 ( e.g. 30 or 40)")
    split_percent = None
    dataset = False
    status = None

    if split:
        try:
            split_percent = int(split)
        except ValueError:
            split_percent = None
        # The range check sits outside the try block: st.stop() works by
        # raising, and the former bare ``except:`` swallowed that as well.
        # Non-numeric and out-of-range input both end with this error + stop,
        # exactly as before.
        if split_percent is None or not 1 <= split_percent <= 100:
            st.sidebar.error(
                "SPLIT PERCENTAGE MUST BE AN INTEGER (BETWEEN 1 TO 100)")
            st.stop()

    if ds in internal_files and split_percent:
        dataset = True
        status, X_train, Y_train, X_test, Y_test, last = LOAD.load_internal_csv_dataset(
            internal_files[ds], split_percent, "Classification")
    elif ds == "Other" and split_percent:
        st.sidebar.markdown(
            "<ul><li>File Format: CSV</li><li>Must be preprocessed</li> "
            "<li>Must not contain any NAN values</li> "
            "<li>Must not contain invalid values (Combination of numeric and non-numeric)</li> "
            "<li>Assumption: First n-1 columns are numeric and last column (output) can be either numeric or non-numeric </li>",
            unsafe_allow_html=True)
        # file_uploader's return value doubles as the "dataset present" flag.
        dataset = st.sidebar.file_uploader("Upload Dataset")
        status, X_train, Y_train, X_test, Y_test, last = LOAD.load_external_csv_dataset(
            dataset, split_percent, "Classification")

    if status == "VALID DATASET" and dataset:
        st.sidebar.success(status)

        # Preview frames: features plus the output column named by ``last``.
        train_data, test_data = X_train.copy(), X_test.copy()
        train_data[last], test_data[last] = Y_train, Y_test
        st.sidebar.dataframe(train_data.head(10))
        st.sidebar.text(f"Training data size: {train_data.shape}")
        st.sidebar.text(f"Testing data size: {test_data.shape}")

        k = top.slider("K", 1, 20)
        TB = top.button("Train")
        if TB:
            # Z_test is the prediction
            Z_test = KNN.predict_knn(X_train, Y_train, X_test, k)
            accuracy = KNN.accuracy(Z_test, Y_test)
            middle.text("Testing Accuracy : {:.2%}".format(accuracy))

            # Show the predictions next to the true output column.
            prediction_column = "Prediction for " + last
            test_data[prediction_column] = np.squeeze(Z_test)
            middle.dataframe(test_data.head(10))
            plot_confusion_matrix(test_data[last], test_data[prediction_column])
            plot(train_data, test_data, last, algo_type)
            st.balloons()
    elif status and dataset:
        # The loader reported a problem; show it and halt the script run.
        st.sidebar.error(status)
        st.stop()