import numpy as np
import matplotlib.pyplot as plt
from classify import classify
from file_import import file_import
from nearest_points import nearest_points_naive_l1
from nearest_points import nearest_points_naive_sup
from nearest_points import nearest_points_opt_l1
from nearest_points import nearest_points_opt_sup
from scipy.spatial import KDTree

# Visual check of the nearest-neighbour routines: pick one random test row
# and highlight the k training rows each routine returns for it.
# Colours: red = returned neighbours, black = remaining training rows,
# blue = the query row.  Columns 1 and 2 of each row are used as x/y.
train = file_import("bananas-2-2d.train.csv")
test = file_import("bananas-2-2d.test.csv")

# `np.int` was an alias of the builtin and was removed in NumPy 1.24;
# the builtin truncates towards zero in exactly the same way.
test_index = int(np.random.rand() * len(test))
k = 100

# Figure 1: neighbours according to nearest_points_naive_sup.
plt.figure(1)
nearest = nearest_points_naive_sup(test[test_index, :], train, k)
# `nearest` is used as a row index into `train`, so the complement can be
# taken on index values directly instead of scanning `nearest` once per row.
rest = np.setdiff1d(np.arange(len(train)), nearest)
plt.scatter(train[nearest, 1], train[nearest, 2], s=0.6, c='r')
plt.scatter(train[rest, 1], train[rest, 2], s=0.6, c='k')
plt.scatter(test[test_index, 1], test[test_index, 2], s=1.5, c='b')

# Figure 2: neighbours according to nearest_points_naive_l1.
plt.figure(2)
nearest = nearest_points_naive_l1(test[test_index, :], train, k)
rest = np.setdiff1d(np.arange(len(train)), nearest)
plt.scatter(train[nearest, 1], train[nearest, 2], s=0.6, c='r')
plt.scatter(train[rest, 1], train[rest, 2], s=0.6, c='k')
plt.scatter(test[test_index, 1], test[test_index, 2], s=1.5, c='b')

# Figure 3: neighbours according to nearest_points_opt_sup.
plt.figure(3)
nearest = nearest_points_opt_sup(test[test_index, :], train, k)
def classify(nametest, nametrain, Kset, l):
    """Print the l-fold cross-validation error of k-NN for every k in Kset.

    The training rows are cut into ``l`` consecutive folds of ``n // l`` rows
    each.  For every row of a fold, ``nearest_points_naive_sup_2`` is asked
    for ``max(Kset)`` neighbours among the rows outside that fold; for every
    ``k`` in ``Kset`` the row is then predicted as the sign of the summed
    labels of its first ``k`` neighbours.  Column 0 of a row is its label,
    the remaining columns are its features.

    Printed, in this order: time of the neighbour search, time of the
    evaluation, the mean error rate per ``k`` (in ``Kset`` order), the
    absolute differences between consecutive rates, and the position in
    ``Kset`` of the smallest rate.

    Args:
        nametest: File name passed to ``file_import``; the data itself is
            not used by this function.
        nametrain: File name of the training data passed to ``file_import``.
        Kset: Non-empty iterable of neighbour counts; each entry is
            truncated with ``int``.
        l: Number of folds.

    Returns:
        None.

    Raises:
        ValueError: If ``l`` is smaller than 1 or there are fewer than ``l``
            training rows (the folds would be empty).
    """
    # Result is unused; the call is kept so an unreadable test file still
    # fails here, as it did before.
    file_import(nametest)
    train = file_import(nametrain)
    features = train[:, 1:]
    n = len(train)
    if l < 1 or n // l == 0:
        raise ValueError("l must be between 1 and the number of training rows")
    block_size = n // l
    k_max = int(max(Kset))

    tic = time.time()
    fold_labels = []       # per fold: labels of the rows inside the fold
    neighbour_labels = []  # per fold: (block_size, k_max) labels of the neighbours
    for i in range(l):
        start, stop = i * block_size, (i + 1) * block_size
        # Everything outside the fold, including rows left over when
        # n is not a multiple of l.
        rest_rows = np.vstack((train[:start, :], train[stop:, :]))
        rest_features = np.vstack((features[:start, :], features[stop:, :]))
        neighbours = np.zeros((block_size, k_max), dtype=int)
        for j in range(block_size):
            # NOTE(review): assumes the helper returns k_max row indices
            # into its second argument, nearest first - confirm.
            neighbours[j, :] = nearest_points_naive_sup_2(
                features[start + j, :], rest_features, k_max)
        fold_labels.append(train[start:stop, 0])
        # Resolve the indices against this fold's own complement right away,
        # so the evaluation below can never mix up rows of different folds.
        neighbour_labels.append(rest_rows[neighbours, 0])
    toc = time.time()
    print("%.10f seconds" % (toc - tic))

    list_ks = []
    tic = time.time()
    for k in Kset:
        fold_errors = []
        for labels, votes in zip(fold_labels, neighbour_labels):
            predicted = np.sign(np.sum(votes[:, :int(k)], axis=1))
            # A summed vote of 0 (tie) matches no non-zero label and is
            # therefore counted as a misclassification.
            fold_errors.append(float(np.mean(predicted != labels)))
        list_ks.append(sum(fold_errors) / l)
    toc = time.time()
    print("%.10f seconds" % (toc - tic))

    print(list_ks)
    print(
        [np.abs(list_ks[i] - list_ks[i + 1]) for i in range(len(list_ks) - 1)])
    print(list_ks.index(min(list_ks)))
def classify(file_name, KSET, l):
    """Choose k by l-fold cross-validation and classify the test data.

    Reads ``file_name + ".train.csv"`` and ``file_name + ".test.csv"`` via
    ``file_import``.  Column 0 of every row is its label.  The first
    ``l * (n // l)`` training rows are cut into ``l`` consecutive folds; for
    every fold row the ``max(KSET)`` neighbours among the other folds are
    obtained from ``nearest_points_opt_l1``.  For each ``k`` in ``KSET`` a row
    is predicted as the sign of the summed labels of its first ``k``
    neighbours (a sum of 0 counts as +1), and the ``k`` with the smallest
    mean error rate (first one on ties, in ``KSET`` order) is selected.

    Every test row is then classified once per fold complement with the
    selected ``k``; the final label is the sign of the summed fold votes
    (0 again counts as +1).  The result is written to
    ``file_name + ".result.csv"``.

    Args:
        file_name: Common prefix of the train/test/result file names.
        KSET: Non-empty iterable of candidate neighbour counts, each >= 1.
        l: Number of folds.

    Returns:
        The imported test array with column 0 overwritten by the predicted
        labels.

    Raises:
        ValueError: If ``KSET`` is empty or contains a value below 1.
    """
    tic = time.time()
    candidates = np.array([int(k) for k in KSET])
    if candidates.size == 0 or candidates.min() < 1:
        raise ValueError("KSET must contain at least one k, and every k >= 1")
    k_max = int(candidates.max())
    test = file_import(file_name + ".test.csv")  # full array, label in column 0
    train = file_import(file_name + ".train.csv")  # same layout
    n = train.shape[0]  # number of points
    m = train.shape[1]  # number of columns, label column included
    # Row r holds the k_max neighbour indices of training row r.
    index_array = np.zeros((n, k_max), dtype=int)
    block_size = n // l  # rows per fold
    D_i_array = np.zeros((l, block_size, m))  # rows inside fold i
    D_strich_i_array = np.zeros(
        (l, block_size * (l - 1), m))  # rows of all other folds
    for i in range(l):
        D_i_array[i] = train[i * block_size:(i + 1) * block_size, :]
        lower_points = train[0:i * block_size, :]
        upper_points = train[(i + 1) * block_size:l * block_size, :]
        D_strich_i_array[i] = np.vstack((lower_points, upper_points))
    toc = time.time()
    print("Initialisierung : %.10f seconds" % (toc - tic))

    tic = time.time()
    for i in range(l):  # k_max nearest neighbours of every fold row
        for j in range(block_size):
            # NOTE(review): whole rows, label column included, are handed to
            # the helper (unchanged from before) - confirm it ignores column 0
            # and returns indices into its second argument, nearest first.
            index_array[block_size * i + j, :] = nearest_points_opt_l1(
                D_i_array[i, j, :], D_strich_i_array[i, :, :], k_max)
    toc = time.time()
    print("Nächste Nachbarn in train : %.10f seconds" % (toc - tic))

    tic = time.time()
    # cumulative_votes[i, j, c] = summed labels of the first c + 1 neighbours
    # of row j in fold i.
    cumulative_votes = np.zeros((l, block_size, k_max))
    for i in range(l):
        fold_neighbours = index_array[i * block_size:(i + 1) * block_size, :]
        cumulative_votes[i] = np.cumsum(
            D_strich_i_array[i, fold_neighbours, 0], axis=1)
    predicted = np.sign(cumulative_votes)
    predicted[predicted == 0] = 1  # ties go to +1
    misclassified = predicted != D_i_array[:, :, 0][:, :, np.newaxis]
    error_per_column = misclassified.sum(axis=1) / block_size
    error_per_column = error_per_column.sum(axis=0) / l
    # Column c belongs to c + 1 neighbours, so candidate k lives in column
    # k - 1.  Only the requested candidates take part in the selection.
    candidate_errors = error_per_column[candidates - 1]
    toc = time.time()
    print("Bestimmung von k_stern : %.10f seconds" % (toc - tic))
    best = np.argmin(candidate_errors)
    k_stern = int(candidates[best])
    print("k_stern = " + str(k_stern))
    print("Klassifikationsfehlerrate: " + str(candidate_errors[best]))

    o = len(test)
    test_index_array = np.zeros((l, o, k_stern), dtype=int)
    tic = time.time()
    for i in range(l):
        for j in range(o):
            test_index_array[i, j, :] = nearest_points_opt_l1(
                test[j, :], D_strich_i_array[i, :, :], k_stern)
    toc = time.time()
    print("Nächste Nachbarn von test : %.10f seconds" % (toc - tic))

    tic = time.time()
    fold_votes = np.zeros((l, o))
    for i in range(l):
        fold_votes[i] = np.sign(
            np.sum(D_strich_i_array[i, test_index_array[i], 0], axis=1))
    fold_votes[fold_votes == 0] = 1  # ties inside a fold go to +1
    total_votes = fold_votes.sum(axis=0)
    # With an even number of folds the fold votes can cancel out; resolve
    # that tie to +1 as well instead of emitting a label of 0.
    test_classification = np.where(total_votes == 0, 1.0,
                                   np.sign(total_votes))
    toc = time.time()
    print("Bestimmung der Klassifikation : %.10f seconds" % (toc - tic))

    test[:, 0] = test_classification
    with open(file_name + ".result.csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(test)
    return test
import numpy as np
import time
from file_import import file_import
from nearest_points import nearest_points_naive_l1
from nearest_points import nearest_points_naive_sup
from nearest_points import nearest_points_opt_l1
from nearest_points import nearest_points_opt_sup
from scipy.spatial import KDTree

# Timing script: for every row of the data set, ask each routine for the
# k neighbours of that row within the same data set and report the
# wall-clock time of the complete pass.
file_name = "ijcnn1.10000.train.csv"
k = 1

# Pass 1: nearest_points_naive_l1.
data = file_import(file_name)
n = data.shape[0]
result_array_1 = np.zeros((n, k))
started = time.time()
for row in range(n):
    result_array_1[row, :] = nearest_points_naive_l1(data[row, :], data, k)
finished = time.time()
print("l1-naiv : %.10f seconds" % (finished - started))

# Pass 2: nearest_points_naive_sup, on a freshly imported copy of the data.
data = file_import(file_name)
n = data.shape[0]
result_array_2 = np.zeros((n, k))
started = time.time()
for row in range(n):
    result_array_2[row, :] = nearest_points_naive_sup(data[row, :], data, k)
finished = time.time()
print("lsup-naiv : %.10f seconds" % (finished - started))

# Fresh import for whatever is measured next.
data = file_import(file_name)
import matplotlib.pyplot as plt
from classify import classify
from file_import import file_import
import numpy as np

# Scatter plots of the data sets, one figure per file.  Column 0 of a row is
# its label: rows labelled 1 are drawn in black, rows labelled -1 in red;
# columns 1 and 2 are used as x/y.
file_name = "bananas-2-2d"

# Figures 1 and 2: training and test data.
for figure_number, csv_suffix, title_suffix in (
        (1, ".train.csv", ".train"),
        (2, ".test.csv", ".test"),
):
    plt.figure(figure_number)
    set = file_import(file_name + csv_suffix)
    list_1 = [idx for idx, row in enumerate(set) if row[0] == 1]
    list_2 = [idx for idx, row in enumerate(set) if row[0] == -1]
    plt.scatter(set[list_1, 1], set[list_1, 2], s=0.6, c="k", label="1")
    plt.scatter(set[list_2, 1], set[list_2, 2], s=0.6, c="r", label="-1")
    plt.title(file_name + title_suffix)
    plt.legend(markerscale=7.5, title="Klassifikation")

# Figure 3: the stored classification result.
plt.figure(3)
# Alternative that recomputes the result instead of reading the stored file:
# set = classify(file_name, np.arange(1,200), 5)
set = file_import(file_name + ".result.csv")
list_1 = [idx for idx, row in enumerate(set) if row[0] == 1]
list_2 = [idx for idx, row in enumerate(set) if row[0] == -1]
plt.scatter(set[list_1, 1], set[list_1, 2], s=0.6, c="k", label="1")