def prob_3(weighted_d=False):
    """Plot KNN-regression mean-squared error on the housing data vs. k.

    Loads the housing train/test ARFF files, shuffles and normalizes both,
    then evaluates KNNClassifier regression MSE for k in {1,3,...,15} and
    shows the resulting curve.

    Args:
        weighted_d: when True, request distance-weighted regression from
            the classifier and note it in the plot title.
    """
    test_arff = Arff("housing_testing_data.arff")
    train_arff = Arff("housing_training_data.arff")
    test_arff.shuffle()
    train_arff.shuffle()
    test_arff.normalize()
    train_arff.normalize()
    # The stacked feature/label matrices do not depend on k, so build them
    # once here (fixed: previously rebuilt inside the loop every iteration).
    test_data = np.hstack((test_arff.get_features().data,
                           test_arff.get_labels().data))
    train_data = np.hstack((train_arff.get_features().data,
                            train_arff.get_labels().data))
    K = [1, 3, 5, 7, 9, 11, 13, 15]
    A = []
    for k_hat in K:
        KNNC = KNNClassifier(k_hat, train_data, test_data)
        A.append(KNNC.get_accuracy_regress(weighted_d))
    plt.plot(K, A, label="")
    t = "KNN Regression M.S.E Housing"
    if weighted_d:
        t += "(weighted-d)"
    # (fixed: removed a stray no-op `weighted_d` expression statement here)
    plt.title(t)
    plt.xlabel("K")
    plt.ylabel("M.S.E")
    # plt.legend()
    plt.show()
def prob4h():
    """Run HAC on the normalized abalone data with both linkage modes.

    Prints silhouette diagnostics for cluster counts 2..7 using single-link
    first, then complete-link.
    """
    arff = Arff('datasets/abalone.arff', label_count=0)
    arff.normalize()
    domain = np.arange(2, 8)

    print('single link --------------------')
    single_link = HAC()
    single_link.train(arff, printk=domain, silhouette=True)

    print('complete link -----------------------')
    complete_link = HAC(simple=False)
    complete_link.train(arff, printk=domain, silhouette=True)
def prob4():
    """Train K-Means on normalized abalone data for k = 2..7.

    Prints the silhouette score after each fit. (Fixed: the SSE values
    returned by ``train`` were accumulated into a list that was never
    read; the unused accumulator has been removed.)
    """
    arff = Arff('datasets/abalone.arff', label_count=0)
    arff.normalize()
    domain = np.arange(2, 8)
    for k in domain:
        km = KMeans(k)
        km.train(arff)
        print(km.calc_silhouette_score())
def test_cases():
    """Smoke-test KMC on the labor dataset with mixed attribute types."""
    # test_1()
    # Per-column type tags ("real" vs "cat") for the labor attributes.
    attr_types = [
        "real", "real", "real", "real", "cat", "real", "cat", "real",
        "real", "cat", "real", "cat", "cat", "cat", "cat", "cat", "cat"
    ]
    # Category vocabularies for the categorical columns (empty for reals).
    attr_idx = [
        [], [], [], [],
        ['none', 'tcf', 'tc'],
        [],
        ['none', 'ret_allw', 'empl_contr'],
        [], [],
        ['yes', 'no'],
        [],
        ['below_average', 'average', 'generous'],
        ['yes', 'no'],
        ['none', 'half', 'full'],
        ['yes', 'no'],
        ['none', 'half', 'full'],
        ['bad', 'good']
    ]
    k = 5
    arff = Arff("labor.arff")
    arff.normalize()
    features = arff.get_features().data
    labels = arff.get_labels().data
    # attributes = arff.get_attr_names()
    # Drop the first column of the stacked matrix before clustering.
    data = np.hstack((features, labels))[:, 1:]
    clusterer = KMC(k, data, data, attr_types, attr_idx)
    clusterer.train(tol=0)
def prob_5():
    """Mixed-distance KNN (k=8) on the credit-approval data, 70/30 split."""
    # Column indices treated as continuous vs. categorical by the mixed
    # distance metric. NOTE(review): presumably these match the ARFF column
    # order shown in the sample row below — confirm against the dataset.
    cont_mask = [1, 2, 7, 10, 13, 14, 16]
    cate_mask = [0, 3, 4, 5, 6, 8, 9, 11, 12, 15]

    arff = Arff("credit_approval_data.arff")
    arff.shuffle()
    arff.normalize()

    # 70% of rows for training, the remainder for testing.
    n_rows = len(arff.get_labels().data)
    split = int(n_rows * .7)
    train_subset = arff.create_subset_arff(row_idx=slice(0, split, 1))
    test_subset = arff.create_subset_arff(row_idx=slice(split, n_rows, 1))
    test_data = np.hstack((test_subset.get_features().data,
                           test_subset.get_labels().data))
    train_data = np.hstack((train_subset.get_features().data,
                            train_subset.get_labels().data))

    #b,30.83,0,u,g,w,v,1.25,t,t,01,f,g,00202,0,+
    # All-ones distance matrix with a zero diagonal: identical categorical
    # values cost 0, any mismatch costs 1 (simple matching distance).
    dist_matrix = np.ones((16, 16))
    np.fill_diagonal(dist_matrix, 0)

    knn = KNNClassifier(8, train_data, test_data)
    print(knn.get_accuracy_mixed(cate_mask, cont_mask, dist_matrix))
def prob_2(weighted_d=False):
    """Compare KNN (k=3) accuracy on the telescope data with and without
    normalization, then plot accuracy over k in {1,3,...,15} on the
    normalized data.

    Args:
        weighted_d: when True, use distance-weighted voting and note it in
            the plot title.
    """
    k = 3
    test_arff = Arff("magic_telescope_testing_data.arff")
    train_arff = Arff("magic_telescope_training_data.arff")
    test_arff.shuffle()
    train_arff.shuffle()
    # attributes = test_arff.get_attr_names()
    test_data = np.hstack((test_arff.get_features().data,
                           test_arff.get_labels().data))
    train_data = np.hstack((train_arff.get_features().data,
                            train_arff.get_labels().data))
    KNNC = KNNClassifier(k, train_data, test_data)
    acc = KNNC.get_accuracy(weighted_d)

    test_arff.normalize()
    train_arff.normalize()
    n_test_data = np.hstack((test_arff.get_features().data,
                             test_arff.get_labels().data))
    n_train_data = np.hstack((train_arff.get_features().data,
                              train_arff.get_labels().data))
    # Fixed: the arguments were previously swapped (test passed as train
    # and vice versa), unlike every other KNNClassifier call in this file.
    n_KNNC = KNNClassifier(k, n_train_data, n_test_data)
    acc_n = n_KNNC.get_accuracy(weighted_d)
    # print(np.array([[acc,acc_n]]))
    print(acc, acc_n)
    # show_table(["Not Normalized" "Normailzed"], ["Accuracy"], np.array([[acc,acc_n]]), title = "Normalized vs Non-normalized, k=3")

    K = [1, 3, 5, 7, 9, 11, 13, 15]
    A = []
    for k_hat in K:
        n_KNNC = KNNClassifier(k_hat, n_train_data, n_test_data)
        A.append(n_KNNC.get_accuracy(weighted_d))
    plt.plot(K, A, label="")
    t = "KNN Accuracy Telesc. "
    if weighted_d:
        t += "(weighted-d)"
    plt.title(t)
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    # plt.legend()
    plt.show()
def prob_6():
    """Compare KNN accuracy/time with and without induced KSM reduction.

    For k in {1, 3, 5}, times get_accuracy() before and after
    KNNC.induce_KSM() and plots (k, accuracy, time) as two 3-D curves.
    """
    k = 3
    test_arff = Arff("magic_telescope_testing_data.arff")
    train_arff = Arff("magic_telescope_training_data.arff")
    test_arff.shuffle()
    train_arff.shuffle()
    test_arff.normalize()
    train_arff.normalize()
    # Data matrices don't depend on k — build once outside the loop
    # (fixed: previously rebuilt via np.hstack on every iteration).
    test_data = np.hstack((test_arff.get_features().data,
                           test_arff.get_labels().data))
    train_data = np.hstack((train_arff.get_features().data,
                            train_arff.get_labels().data))
    K = [1, 3, 5]
    T = []
    A = []
    T_KSM = []
    A_KSM = []
    for k_hat in K:
        KNNC = KNNClassifier(k_hat, train_data, test_data)
        # Time the plain classifier first...
        t = time.time()
        A.append(KNNC.get_accuracy())
        T.append(time.time() - t)
        # ...then reduce the training set with KSM and time it again.
        # Note: induce_KSM() itself is deliberately excluded from the timing.
        KNNC.induce_KSM()
        t = time.time()
        A_KSM.append(KNNC.get_accuracy())
        T_KSM.append(time.time() - t)
    ax = plt.axes(projection='3d')
    ax.plot(K, A, T, label="No-KSM")
    ax.plot(K, A_KSM, T_KSM, label="KSM")
    ax.set_xlabel('K')
    ax.set_ylabel('Accuracy')
    ax.set_zlabel('Time')
    t = "KNN Accuracy w/ IKSM"
    plt.title(t)
    plt.legend()
    plt.show()
def prob_3():
    """Plot mean-squared error of regression KNN on housing prices vs. k."""
    # Use regression knn on housing price prediction dataset
    train = Arff('datasets/housing_train.arff')
    test = Arff('datasets/housing_test.arff')
    train.normalize()
    test.normalize()

    krange = np.arange(1, 16, 2)
    mses = []
    expected = np.ravel(test.get_labels().data)
    for k in krange:
        preds = KNN(k).knn(train.get_features(), train.get_labels(),
                           test.get_features())
        mses.append(sum((preds - expected) ** 2) / len(preds))

    plt.plot(krange, mses)
    plt.title("K Size Versus MSE on Housing Prices")
    plt.xlabel("K")
    plt.ylabel("Mean Squared Error")
    plt.show()
def prob_4_telescope():
    """Distance-weighted KNN accuracy on the telescope data, plotted vs. k."""
    # Repeat experiments for magic telescope and housing using weights (w = 1/dist**2)
    train = Arff('datasets/magic_telescope_train.arff')
    test = Arff('datasets/magic_telescope_test.arff')
    train.normalize()
    test.normalize()

    truth = np.ravel(test.get_labels().data)
    krange = np.arange(1, 16, 2)
    accs = []
    for k in krange:
        model = KNN(k, weighting=True)
        predictions = model.knn(train.get_features(), train.get_labels(),
                                test.get_features())
        hits = predictions == truth
        score = sum(hits) / len(hits)
        print("k:", k, "accuracy:", score)
        accs.append(score)

    plt.plot(krange, accs)
    plt.title("K Size Versus Accuracy")
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    plt.show()
def prob_2():
    """KNN (k=3) on the telescope data before/after normalization, then an
    accuracy-vs-k sweep on the normalized data."""
    # try first without normalizing
    train = Arff('datasets/magic_telescope_train.arff')
    test = Arff('datasets/magic_telescope_test.arff')

    classifier = KNN(3)
    predictions = classifier.knn(train.get_features(), train.get_labels(),
                                 test.get_features())
    acc = predictions == np.ravel(test.get_labels().data)
    print("Before normalization:", sum(acc) / len(acc))

    train.normalize()
    test.normalize()
    predictions = classifier.knn(train.get_features(), train.get_labels(),
                                 test.get_features())
    acc = predictions == np.ravel(test.get_labels().data)
    print("After normalization:", sum(acc) / len(acc))

    print("PART TWO:")
    truth = np.ravel(test.get_labels().data)
    krange = np.arange(1, 16, 2)
    accs = []
    for k in krange:
        predictions = KNN(k).knn(train.get_features(), train.get_labels(),
                                 test.get_features())
        acc = predictions == truth
        score = sum(acc) / len(acc)
        print("k:", k, "accuracy:", score)
        accs.append(score)

    plt.plot(krange, accs)
    plt.title("K Size Versus Accuracy")
    plt.xlabel("K")
    plt.ylabel("Accuracy")
    plt.show()
def prob_4_housing():
    """Distance-weighted regression KNN MSE on housing prices, plotted vs. k."""
    # Repeat experiments for magic telescope and housing using weights (w = 1/dist**2)
    train = Arff('datasets/housing_train.arff')
    test = Arff('datasets/housing_test.arff')
    train.normalize()
    test.normalize()

    targets = np.ravel(test.get_labels().data)
    krange = np.arange(1, 16, 2)
    mses = []
    for k in krange:
        model = KNN(k, weighting=True)
        preds = model.knn_regression(train.get_features(),
                                     train.get_labels(),
                                     test.get_features())
        mses.append(np.sum((preds - targets) ** 2, axis=0) / len(preds))

    plt.plot(krange, mses)
    plt.title("K Size Versus MSE on Housing (Weighted)")
    plt.xlabel("K")
    plt.ylabel("Mean Squared Error")
    plt.show()
def prob3_normalized():
    """Plot SSE vs. cluster count on normalized abalone data for K-Means
    and both HAC linkage variants."""
    arff = Arff('datasets/abalone.arff', label_count=0)
    arff.normalize()
    domain = np.arange(2, 8)

    # K-Means: one fit per k, collecting the SSE each returns.
    ssekmm = [KMeans(k).train(arff) for k in domain]

    # HAC produces SSEs for all requested k in one agglomerative pass.
    single = HAC()
    complete = HAC(simple=False)
    ssehac = single.train(arff, printk=domain)
    ssehac2 = complete.train(arff, printk=domain)

    plt.plot(domain, ssekmm, label="K-Means SSE")
    # HAC results come out in the opposite k order, hence the reversal.
    plt.plot(domain, ssehac[::-1], label="HAC (Single-Link) SSE")
    plt.plot(domain, ssehac2[::-1], label="HAC (Complete-Link) SSE")
    plt.title("Abalone SSE (Normalized) vs # of Clusters")
    plt.xlabel("# of Clusters")
    plt.ylabel('SSE')
    plt.legend()
    plt.show()