コード例 #1
0
ファイル: main.py プロジェクト: julianweisbord/cs_434
def main():
    """Run the KNN experiment: normalise the data, score it, optionally plot.

    Steps, in order:

    1. Load the four arrays returned by ``knn.load_data()``.
    2. Divide every feature row by its own sum.
    3. Append the label column to the features of each split.
    4. For the K values from ``knn.gen_k_values()``, obtain the training
       result, a leave-one-out result over the training rows, and the
       testing result from ``knn.accuracy``.
    5. If the module-level ``PLOT`` flag is truthy, plot all three curves
       against K.

    Returns nothing; results are printed and (optionally) plotted.
    """
    X_train, Y_train, X_test, Y_test = knn.load_data()
    print("X_train[0]", X_train[0])
    print(X_test)

    # Row-wise normalisation: each row is divided by its own sum.  np.newaxis
    # turns the 1-D vector of row sums into a column so it broadcasts per row.
    # NOTE(review): values land in [0, 1] only if all features are
    # non-negative -- confirm against the data set.
    new_dim = np.newaxis
    X_train = X_train / X_train.sum(axis=1)[:, new_dim]
    X_test = X_test / X_test.sum(axis=1)[:, new_dim]
    print("X_train after normalize: ", X_train[0])

    # Glue the labels on as the last column of each split.
    train_normal = np.append(X_train, Y_train, axis=1)
    print("train[0]: ", train_normal[0])
    test_normal = np.append(X_test, Y_test, axis=1)
    print("test[0]: ", test_normal[0])

    K = knn.gen_k_values()

    # Training set scored against itself.  Computed once and reused for the
    # printout and the plot (the same call used to be made twice).
    train_error = knn.accuracy(train_normal, train_normal, K)
    print(train_error)

    # Leave-one-out pass over the TRAINING rows: score each row against the
    # training set with that row removed.  The per-row results are collected
    # in a list that grows with the training set, so the loop no longer
    # depends on the test set happening to have the same number of rows.
    n_train = len(train_normal)
    per_example = []
    for i, example in enumerate(train_normal):
        # np.delete returns a copy, so train_normal itself is never modified
        # and needs no re-insertion afterwards.
        without_example = np.delete(train_normal, i, 0)
        per_example.append(
            knn.accuracy(without_example, example, K, leave_out=True))

    # Column-wise totals (one per K value) averaged over the training rows.
    # NOTE(review): the "- 1" is kept from the original code; confirm whether
    # the intended denominator is n_train or n_train - 1.
    cross_error = [
        np.float32(total) / (n_train - 1)
        for total in np.sum(per_example, axis=0)
    ]

    print("!!!!!!Cross results", cross_error)

    test_error = knn.accuracy(train_normal, test_normal, K)

    print("!!!!!!training results", train_error)
    print("!!!!!!testing results", test_error)
    print(len(train_error), len(test_error))

    if PLOT:
        plt.plot(K, train_error)
        plt.plot(K, test_error)
        print("len cross: ", len(cross_error))
        plt.plot(K, cross_error)
        plt.xlabel('K values')
        plt.ylabel('Error')
        plt.show()
コード例 #2
0
def knn_get_both_accuracies(k, xTrain, yTrain, xTest, yTest):
    """Fit one ``knn.Knn`` model and score it on the training and test data.

    The model is built with ``int(k)`` and trained on ``xTrain`` with the
    ``'label'`` entry of ``yTrain``.  Both ``yTrain`` and ``yTest`` are
    indexed with ``'label'`` to obtain the reference labels.

    Returns
    -------
    tuple
        ``(train_score, test_score)``, each being whatever ``knn.accuracy``
        returns for the corresponding split.
    """
    classifier = knn.Knn(int(k))
    classifier.train(xTrain, yTrain['label'])

    # Same two steps for each split, training data first: predict, then score.
    scores = []
    for features, targets in ((xTrain, yTrain), (xTest, yTest)):
        predicted = classifier.predict(features)
        scores.append(knn.accuracy(predicted, targets['label']))

    train_score, test_score = scores
    return train_score, test_score
コード例 #3
0
def main():
    """
    Command-line entry point: sweep odd k from 1 to 19, print the scores and
    save a line plot of them to "q3d.png".
    """
    # Each option is (flag, default file name, help text); all four take a
    # file name and are registered the same way.
    cli = argparse.ArgumentParser()
    for flag, default_file, description in (
        ("--xTrain", "q3xTrain.csv",
         "filename for features of the training data"),
        ("--yTrain", "q3yTrain.csv",
         "filename for labels associated with training data"),
        ("--xTest", "q3xTest.csv",
         "filename for features of the test data"),
        ("--yTest", "q3yTest.csv",
         "filename for labels associated with the test data"),
    ):
        cli.add_argument(flag, default=default_file, help=description)
    args = cli.parse_args()

    # Read the four CSV files named on the command line.
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    yTest = pd.read_csv(args.yTest)

    # One row of [k, train score, test score] per candidate k.
    rows = []
    for k in range(1, 20, 2):
        classifier = knn.Knn(k)
        classifier.train(xTrain, yTrain['label'])
        train_predictions = classifier.predict(xTrain)
        train_score = knn.accuracy(train_predictions, yTrain['label'])
        test_predictions = classifier.predict(xTest)
        test_score = knn.accuracy(test_predictions, yTest['label'])
        rows.append([k, train_score, test_score])

    results = pd.DataFrame(rows, columns=["k", "train", "test"])
    print(results)

    # Index by k so the plot uses k on the x axis and draws one line per
    # remaining column.
    results = results.set_index("k")
    sns.set(style="whitegrid")
    axes = sns.lineplot(data=results, palette="tab10", linewidth=2.5)
    figure = axes.get_figure()
    figure.savefig("q3d.png")
コード例 #4
0
def knn_train_test(k, xTrain, yTrain, xTest, yTest):
    """
    Train a ``knn.Knn`` model with the given k and score its predictions on
    the test data.

    Parameters
    ----------
    k : int
        The number of neighbors, passed straight to ``knn.Knn``.
    xTrain : nd-array with shape n x d
        Training data
    yTrain : mapping-like with a 'label' entry
        Training labels; the code reads ``yTrain['label']``.
    xTest : nd-array with shape m x d
        Test data
    yTest : mapping-like with a 'label' entry
        Test labels; the code reads ``yTest['label']``.

    Returns
    -------
    acc
        The value ``knn.accuracy`` returns for the test predictions
        against ``yTest['label']``.
    """
    classifier = knn.Knn(k)
    classifier.train(xTrain, yTrain['label'])
    # Only the test split is scored here.
    test_predictions = classifier.predict(xTest)
    acc = knn.accuracy(test_predictions, yTest['label'])
    return acc
コード例 #5
0
def test_accuracy():
    """Check that ``knn.accuracy`` yields '60.0%' for k=3 on the knn module's data."""
    # Inputs are taken from attributes of the knn module itself.
    frame = knn.testing
    held_out = knn.df_testing
    reference = knn.df_data
    neighbours = 3
    reported = knn.accuracy(frame, held_out, reference, neighbours)
    assert reported == '60.0%', "Accuracy calculated wrong"
コード例 #6
0
def cross_validate(features, labels):
    """Average the ``accuracy`` result over 10 interleaved folds.

    In round ``r`` the samples at positions r, r+10, r+20, ... are held out;
    a model is built from the rest with ``learn_model(1, ...)`` and evaluated
    on the held-out samples.  Returns the mean of the ten results.
    """
    n_folds = 10
    running_total = 0.0
    for offset in range(n_folds):
        # Boolean mask of the held-out samples; its negation is the fit set.
        held_out = np.zeros(len(features), bool)
        held_out[offset::n_folds] = True
        kept = ~held_out

        fitted = learn_model(1, features[kept], labels[kept])
        running_total += accuracy(features[held_out], labels[held_out],
                                  fitted)
    return running_total / float(n_folds)
def cross_validation(features, labels):
    """Run 10 rounds of hold-out evaluation and return the mean result.

    Round ``i`` holds out every tenth sample starting at index ``i``, builds
    a model on the remaining samples via ``learn_model(1, ...)`` and passes
    the held-out samples plus the model to ``accuracy``.
    """
    rounds = 10
    sample_count = len(features)
    accumulated = 0.0
    for start in range(rounds):
        # Mark the evaluation samples, then invert to get the fit samples.
        evaluate_on = np.zeros(sample_count, bool)
        evaluate_on[start::rounds] = True
        fit_on = ~evaluate_on

        fitted = learn_model(1, features[fit_on], labels[fit_on])
        round_result = accuracy(features[evaluate_on], labels[evaluate_on],
                                fitted)
        accumulated += round_result
    return accumulated / float(rounds)
コード例 #8
0
def oneFold():
    """Evaluate a k=1 model on a single hold-out split and print the result.

    Reads ``features`` and ``labels`` from the enclosing (module) scope.
    Every fourth sample starting at index 1 (1, 5, 9, ...) is held out for
    evaluation; the model is fitted on everything else.  Returns nothing.
    """
    # masks for training and testing
    training = np.ones(len(features), bool)
    # hold out every 4th sample, starting at index 1
    training[1::4] = 0
    testing = ~training

    k = 1
    model = fit_model(k, features[training], labels[training])
    accr = accuracy(model, features[testing], labels[testing])
    # Call form of print: valid on Python 3, and with a single argument it
    # prints exactly the same text on Python 2.
    print('Aprox Accuracy was{0:.1%}'.format(accr))
コード例 #9
0
def cross_validate(features, labels):
    """Return the mean ``accuracy`` of a k=1 model over 10 interleaved folds.

    Fold ``f`` holds out the samples at positions f, f+10, f+20, ...; the
    model is fitted with ``fit_model`` on the remaining samples and scored
    on the held-out ones.
    """
    neighbours = 1
    fold_count = 10
    score_sum = 0.0
    for offset in range(fold_count):
        # Build the hold-out mask directly; the fit mask is its complement.
        held_out = np.zeros(len(features), bool)
        held_out[offset::fold_count] = True
        kept = ~held_out

        fitted = fit_model(neighbours, features[kept], labels[kept])
        score_sum += accuracy(fitted, features[held_out], labels[held_out])
    return score_sum / fold_count
コード例 #10
0
ファイル: knncv.py プロジェクト: joskid/machine-learning-5
def cross_validate(train_data, train_labels, k, distance, F=5, prints=True):
    """
    Run F-fold cross-validation on the training set and return an array of
    length F holding the accuracy obtained in each round.

    Each round validates on one contiguous slice of ``shape[0] // F`` rows
    and classifies it with ``knn.classify`` using all other rows.  When
    ``prints`` is true the round's confusion matrix is printed with row and
    column labels '1', '2', '7'.
    """
    n_samples = train_data.shape[0]
    # rows per validation slice (integer division, so any remainder rows are
    # never part of a validation slice)
    fold_size = n_samples // F
    scores = np.zeros(F)
    class_names = ['1', '2', '7']

    for fold in range(F):
        # contiguous validation slice for this round
        first = fold * fold_size
        held_out = np.arange(first, first + fold_size)
        # every remaining row is used for training
        kept = np.setdiff1d(np.arange(0, n_samples), held_out)

        predicted = knn.classify(train_data[kept], train_labels[kept],
                                 train_data[held_out], k, distance)
        matrix = knn.confusion_matrix(train_labels[held_out], predicted)

        # Labelled printout; callers sweeping several k values turn this off.
        if prints:
            labelled = pd.DataFrame(matrix,
                                    index=class_names,
                                    columns=class_names)
            print('Cross-validation round', fold + 1)
            print(
                'Confusion matrix: Predicted classes along horizontal axis. Actual classes along vertical axis.'
            )
            print(labelled)

        scores[fold] = knn.accuracy(matrix)
    return scores
コード例 #11
0
ファイル: orient.py プロジェクト: ATarfe/Image-Classification
        f.close()
        
    
elif model == 'nearest':
    
    file = open(input_file, "r")
    if train_or_test == 'train':
        train_data,train_names=knn.read_file(file)
        f = open(model_file, 'wb')
        pickle.dump(train_data, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    else:
        f = open(model_file, 'rb')
        train_data = pickle.load(f)
        test_data,test_names=knn.read_file(file)	
        knn.accuracy(knn.knn(knn.euclidean(train_data,test_data), train_data), test_names)
        f.close()

elif model == 'forest':

    if train_or_test == 'train':
       forest.train(input_file, model_file)

    else:
        
        forest.test_forest(input_file, model_file)

                

コード例 #12
0
            Xtest,yTest,XtestID = myBoost.getDataFromFile(train_test_file)
            finalPredictions = myBoost.predict(Xtest)
            myBoost.writeToFile(XtestID,finalPredictions,'output.txt')
            print("Accuracy is: " ,sum(finalPredictions==yTest)/len(yTest))
        else:
            print("Untrained model being tested")

#train train-data.txt knn_model.txt knn
#test test-data.txt knn_model.txt knn
if model == 'knn':

    if trainOrTest == 'train':
        knn.train(train_test_file, model_file)

    elif trainOrTest == 'test':
        try:
            # Probe only: check that the model file can be opened, then close
            # the handle straight away.  knn.test() is given the path, not
            # this handle.
            open(model_file, 'rb').close()
        except IOError:
            print("output file has not been generated")
        else:
            # Runs only when the model file could be opened; previously the
            # test was attempted even after the message above was printed.
            # NOTE(review): 48 is presumably the k value -- confirm against
            # knn.test.
            finalPredictions, yTest, XtestID = knn.test(
                48, model_file, train_test_file)
            knn.writeToFile(XtestID, finalPredictions, 'output.txt')
            print("Accuracy is: ", knn.accuracy(finalPredictions, yTest))

    
    
    
    
    
    
    
コード例 #13
0
''' This is a script to run the classifier '''

from knn import KNN
from knn import accuracy

print("Iris flower predictions based on KNN:")

# Build the classifier from one data file and classify a second one.
model = KNN('./data/iris.data')
predictions = model.run_classifier('./data/bezdekIris.data')
print("Predictions for test data set:", predictions)
# Stored under its own name so the imported accuracy() function is not
# overwritten by its result and stays callable afterwards.
model_accuracy = accuracy(predictions, './data/bezdekIris.data')
print("model accuracy:", model_accuracy)
コード例 #14
0
def totalAcuuracyGaussian1():
    """Return the sum over folds 0-9 of one tenth of each fold's accuracy.

    For every fold ``i`` the result of
    ``totalGaussianClassification(folds[i])`` is scored with ``knn.accuracy``
    against ``data[classes[i]]``.  ``folds``, ``data`` and ``classes`` come
    from the enclosing scope.
    """
    mean_accuracy = 0.0
    for fold_index in range(10):
        predicted = totalGaussianClassification(folds[fold_index])
        expected = data[classes[fold_index]]
        # Each fold contributes one tenth of its score to the total.
        mean_accuracy += knn.accuracy(predicted, expected) / 10.0
    return mean_accuracy
コード例 #15
0
def k_nearest_neighbor_model(algo_type):
    """Render the K-nearest-neighbour page of the Streamlit app.

    Sidebar: choose a built-in data set or upload a CSV, and enter the test
    split percentage.  Once a data set has loaded with status
    "VALID DATASET", a K slider and a "Train" button are shown; pressing the
    button runs ``KNN.predict_knn``, reports the testing accuracy, shows the
    first ten test rows with their predictions and draws the plots.

    Parameters
    ----------
    algo_type
        Passed through unchanged to ``plot``.

    Returns nothing.  Execution of the script is halted with ``st.stop()``
    when the split percentage is invalid or the loader reports a status
    other than "VALID DATASET".
    """
    # Built-in data sets and the CSV file each one is loaded from.
    internal_files = {
        "Iris Flower Classification": "Iris.csv",
        "Wine Quality Classification": "wine.csv",
        "Heart Disease Chances Prediction": "heart.csv",
    }
    options = [
        "Iris Flower Classification", "Wine Quality Classification",
        "Heart Disease Chances Prediction", "Other"
    ]
    ds = st.sidebar.selectbox(
        "Choose Dataset (Choose Other to Upload External File)", options)
    split = st.sidebar.text_input(
        "Enter test split percentage between 1 and 100 ( e.g. 30 or 40)")
    split_percent = None
    dataset = False
    status = None

    if split:
        # Validate explicitly.  Only the int() conversion is guarded, so the
        # exception st.stop() uses to halt the script is never swallowed.
        try:
            split_percent = int(split)
        except ValueError:
            split_percent = None
        if split_percent is None or not 1 <= split_percent <= 100:
            st.sidebar.error(
                "SPLIT PERCENTAGE MUST BE AN INTEGER (BETWEEN 1 TO 100)")
            st.stop()

    # Nothing is loaded until a valid split percentage has been entered.
    if split_percent:
        if ds in internal_files:
            dataset = True
            status, X_train, Y_train, X_test, Y_test, last = LOAD.load_internal_csv_dataset(
                internal_files[ds], split_percent, "Classification")
        elif ds == "Other":
            st.sidebar.markdown(
                """<ul><li>File Format: CSV</li><li>Must be preprocessed</li> <li>Must not contain any NAN values</li>
			<li>Must not contain invalid values (Combination of numeric and non-numeric)</li>
			<li>Assumption: First n-1 columns are numeric and last column (output) can be either numeric or non-numeric </li>""",
                unsafe_allow_html=True)

            dataset = st.sidebar.file_uploader("Upload Dataset")
            status, X_train, Y_train, X_test, Y_test, last = LOAD.load_external_csv_dataset(
                dataset, split_percent, "Classification")

    if status == "VALID DATASET" and dataset:
        st.sidebar.success(status)
        # Work on copies so the label column can be re-attached for display
        # without modifying the feature frames handed to the classifier.
        train_data, test_data = X_train.copy(), X_test.copy()
        train_data[last], test_data[last] = Y_train, Y_test

        st.sidebar.dataframe(train_data.head(10))
        st.sidebar.text(f"Training data size: {train_data.shape}")
        st.sidebar.text(f"Testing data size: {test_data.shape}")

        k = top.slider("K", 1, 20)
        TB = top.button("Train")

        if TB:
            # Z_test holds the predictions for X_test.
            Z_test = KNN.predict_knn(X_train, Y_train, X_test, k)
            accuracy = KNN.accuracy(Z_test, Y_test)
            middle.text("Testing Accuracy : {:.2%}".format(accuracy))
            # Show the predictions next to the true labels.
            test_data["Prediction for " + last] = np.squeeze(Z_test)

            middle.dataframe(test_data.head(10))
            plot_confusion_matrix(test_data[last],
                                  test_data["Prediction for " + last])
            plot(train_data, test_data, last, algo_type)
            st.balloons()

    elif status and dataset:
        # The loader returned a status other than "VALID DATASET".
        st.sidebar.error(status)
        st.stop()