def test_homo(self, gamma=3):
    """Evaluate retrieval scores obtained by diffusing in both directions.

    One score matrix is built by diffusing over the gallery features and
    probing with the query features, a second one with the roles swapped.
    The two are blended with weight ``lam`` and the negated blend is handed
    to ``self.evaluate``.

    :param gamma: exponent applied to the normalised neighbour similarities.
    """
    n_trunc = 1000
    kq, kd = 15, 30
    lam = 0.2

    def directional_scores(base_fc, probe_fc, n_rows, n_cols):
        # Build the truncated Laplacian inverse over ``base_fc`` and score
        # every row of ``probe_fc`` through its ``kq`` nearest base entries.
        laplacian_inv = Diffusion(base_fc).get_laplacian_inverse(n_trunc, kd)
        weights, neighbours = KNN(base_fc, method='cosine').search(probe_fc, kq)
        # Drop negative similarities, normalise each row, then sharpen.
        weights[weights < 0] = 0
        weights /= np.sum(weights, axis=-1).reshape(-1, 1)
        weights = weights**gamma
        scores = np.empty((n_rows, n_cols), dtype=np.float32)
        for row in range(n_rows):
            scores[row] = weights[row] @ laplacian_inv[neighbours[row]]
        return scores

    n_query = len(self.test_query_set)
    n_gallery = len(self.test_gallery_set)

    scores_qg = directional_scores(
        self.cross_gallery_fc, self.cross_query_fc, n_query, n_gallery)
    scores_gq = directional_scores(
        self.cross_query_fc, self.cross_gallery_fc, n_gallery, n_query)

    # scores_gq is (gallery, query); transpose it to line up with scores_qg.
    blended = lam * scores_qg + (1 - lam) * scores_gq.T
    self.evaluate(-blended)
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """Grid-search scaler, distance function and k by validation F1 score.

    Every scaler class in ``scaling_classes`` is instantiated and applied to
    both the training and the validation features; for each distance function
    and each odd k in 1..29 a KNN model is trained and scored with
    ``f1_score``. The first combination reaching the highest score wins.

    :param distance_funcs: dict mapping distance-function name to function.
    :param scaling_classes: dict mapping scaler name to scaler class.
    :param x_train: training features.
    :param y_train: training labels.
    :param x_val: validation features.
    :param y_val: validation labels.
    :return: tuple ``(best_k, best_distance_function, best_scaler,
        best_model)``; the same values are stored on ``self``. All four are
        ``None`` when no combination was evaluated.
    """
    best_f1 = -1
    best_k = None
    best_distance_function = None
    best_scaler = None
    best_model = None

    for scaler_name in scaling_classes:
        scaler = scaling_classes[scaler_name]()
        scaled_train = scaler(x_train)
        scaled_val = scaler(x_val)
        for func_name in distance_funcs:
            for k in range(1, 30, 2):
                candidate = KNN(k, distance_funcs[func_name])
                candidate.train(scaled_train, y_train)
                score = f1_score(y_val, candidate.predict(scaled_val))
                if score > best_f1:
                    best_f1 = score
                    best_k = k
                    best_distance_function = func_name
                    best_scaler = scaler_name
                    # Keep the model trained on the *scaled* data it was
                    # selected with; retraining on raw x_train would not
                    # match the reported scaler.
                    best_model = candidate

    self.best_k = best_k
    self.best_distance_function = best_distance_function
    self.best_scaler = best_scaler
    self.best_model = best_model
    return best_k, best_distance_function, best_scaler, best_model
def test_knn():
    """Exercise the KNN class on Iris data and plot against scikit-learn.

    Downloads the UCI Iris CSV, fits/plots a 1-neighbour model on the first
    100 rows, then repeats with scikit-learn's bundled Iris data for k=15 and
    k=1, and finally plots scikit-learn's own 1-NN decision regions.
    """
    frame = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
        header=None)

    species = frame.iloc[0:100, 4].values
    labels = np.where(species == 'Iris-setosa', -1, 1)
    # NOTE(review): the setosa/other labels above are discarded right away
    # and replaced by random binary labels - confirm this is intended.
    labels = np.random.randint(2, size=100)
    features = frame.iloc[0:100, [0, 2]].values

    print("Testing 2-D Iris data set with only one neighbor...")
    model = KNN(k=1)
    model.fit(features, labels)
    model.plot(features, labels)

    print("Testing Iris data set with 15 neighbor...")
    iris = datasets.load_iris()
    features = iris.data[:, :2]
    labels = iris.target
    model = KNN(15)
    model.fit(features, labels)
    predicted = model.predict(features)
    model.accuracy(predicted, labels)
    model.plot(features, labels)

    print(
        "Adding new point to dataset and testing with full Iris 1-k data set..."
    )
    model = KNN(1)
    model.fit(features, labels)
    # Plot with one extra point (label 1) that the model was not fitted on.
    extended_labels = np.append(labels, np.array([1]))
    extended_features = np.vstack([features, [5.0, 3.2]])
    model.plot(extended_features, extended_labels)

    print("Testing SKLearn's model...")
    reference = neighbors.KNeighborsClassifier(1)
    reference.fit(features, labels)
    plot_decision_regions(extended_features, extended_labels, reference)
def main(n):
    """Run the DTW and Euclidean KNN experiments for partition ``n``.

    Reads test vectors from ``./particoes/ts<n>.txt`` (first field is the
    expected label, remaining fields are floats), classifies each of them
    against ``./particoes/cj<n>.txt`` for every k in ``[1, 3, 5, 10]`` and
    writes the accuracies to ``./relatorios/relatorioFinal<n>.txt``.

    Both files are opened with ``with`` so they are closed (and the report
    flushed) even if an error occurs.

    :param n: partition number used to build the file names.
    :raises ZeroDivisionError: if the test file contains no lines.
    """
    print('Teste: ' + str(n))
    arquivoTreino = './particoes/cj' + str(n) + '.txt'
    k = [1, 3, 5, 10]
    count = 0
    hitDTW = [0] * len(k)
    hitEuclidiana = [0] * len(k)

    with open('./particoes/ts' + str(n) + '.txt', 'r') as teste:
        for testeLinha in teste:
            count += 1  # number of test lines processed so far
            linhateste = testeLinha.split(" ")
            vetordeteste = list(map(float, linhateste[1:]))
            ResultadoDTW = KNN(arquivoTreino, vetordeteste, k).runKNN_DTW()
            ResultadoEuclidiana = KNN(arquivoTreino, vetordeteste,
                                      k).runKNN_Euclidiana()

            # Hit counters: one result entry per k, element 1 is compared
            # with the expected label.
            for pos, resultado in enumerate(ResultadoDTW):
                if int(linhateste[0]) == int(resultado[1]):
                    hitDTW[pos] += 1
            for pos, resultado in enumerate(ResultadoEuclidiana):
                if int(linhateste[0]) == int(resultado[1]):
                    hitEuclidiana[pos] += 1

            # Progress indicator; 240 is the hard-coded expected number of
            # test lines per partition.
            print("Progresso" + str(n) + ": " + str(count * 100 / 240) + "%")

    # Test report.
    with open("./relatorios/relatorioFinal" + str(n) + ".txt", "w") as relatorio:
        relatorio.write('Teste: ' + str(n) + "\n")
        for nome, hits in (("DTW", hitDTW), ("Euclidiana", hitEuclidiana)):
            for k_value, hit in zip(k, hits):
                relatorio.write("Accuracy " + nome + " K=" + str(k_value) +
                                ": " + str(hit * 100 / count) + "% \n")
            relatorio.write("Accuracy " + nome + " TOTAL: " +
                            str((sum(hits) * 100) / (len(k) * count)) +
                            "% \n")
def plot_knn():
    """Plot OLS and KNN (k=1, 5, 10) regression fits on 16 random problems.

    For each of the 4x4 subplots a random 1-D regression problem is drawn,
    a linear regression and three KNN regressors are fitted, and their
    predictions over a padded x-range are plotted on top of the test points.
    The figure is saved to ``img/knn_plots.png``.
    """
    np.random.seed(12345)
    fig, axes = plt.subplots(4, 4)
    neighbour_counts = (1, 5, 10)

    for i, ax in enumerate(axes.flatten()):
        n_in = 1
        n_out = 1
        # The draw order below fixes the random stream; do not reorder.
        d = np.random.randint(1, 5)
        n_ex = np.random.randint(5, 500)
        std = np.random.randint(0, 1000)
        intercept = np.random.rand() * np.random.randint(-300, 300)
        X_train, y_train, X_test, y_test, coefs = random_regression_problem(
            n_ex, n_in, n_out, d=d, intercept=intercept, std=std, seed=i)

        LR = LinearRegression(fit_intercept=True)
        LR.fit(X_train, y_train)
        y_pred = LR.predict(X_test)
        # Test MSE values are computed but not currently shown on the plot.
        loss = np.mean((y_test.flatten() - y_pred.flatten())**2)

        knn_models = []
        knn_losses = {}
        for k in neighbour_counts:
            regressor = KNN(k=k, classifier=False, leaf_size=10,
                            weights="uniform")
            regressor.fit(X_train, y_train)
            test_pred = regressor.predict(X_test)
            knn_losses[k] = np.mean(
                (y_test.flatten() - test_pred.flatten())**2)
            knn_models.append((k, regressor))

        # Extend the plotted range 10% beyond the test data on both sides.
        lowest, highest = min(X_test), max(X_test)
        margin = 0.1 * (highest - lowest)
        X_plot = np.linspace(lowest - margin, highest + margin, 100)

        curves = [("OLS", LR.predict(X_plot))]
        for k, regressor in knn_models:
            curves.append((f"KNN (k={k})", regressor.predict(X_plot)))

        ax.scatter(X_test, y_test, alpha=0.5)
        for label, y_curve in curves:
            ax.plot(X_plot, y_curve, label=label, alpha=0.5)
        ax.legend()

        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    plt.tight_layout()
    plt.savefig("img/knn_plots.png", dpi=300)
    plt.close("all")
def rerank_results(feedback, similar_images, similar_image_vectors, query_image_vector):
    """Re-rank similar images using decision-tree relevance feedback.

    A decision tree is fitted on the user feedback and used to label every
    vector in ``similar_image_vectors``. Images predicted relevant (1) are
    returned first, followed by those predicted irrelevant (-1), each group
    in its original order. Neighbours of the query inside each group are
    computed and printed but do not influence the returned order.

    :param feedback: dict mapping image id to feedback value.
    :param similar_images: stored in the module-level ``similar_images_g``.
    :param similar_image_vectors: dict mapping image id to feature vector.
    :param query_image_vector: feature vector of the query image.
    :return: list of image ids, relevant ones first.
    """
    global feedback_imgs_g, feedback_vals_g, similar_images_g, similar_image_vectors_g
    similar_images_g = similar_images
    similar_image_vectors_g = similar_image_vectors

    # Decision-tree based relevance feedback.
    clf = DecisionTree()
    feedback_imgs = list(feedback)
    feedback_vals = list(feedback.values())
    raw_train, y_train = get_training_set(feedback_imgs, feedback_vals)
    x_train = [vector.tolist() for vector in raw_train]
    clf.fit(x_train, y_train)

    x_test = [vector.tolist() for vector in similar_image_vectors_g.values()]
    predictions = clf.predict(x_test)

    def query_neighbours(indices):
        # Fit a KNN on the selected vectors (k = group size) and query it.
        group = [x_test[idx] for idx in indices]
        knn = KNN(len(indices))
        knn.fit(group)
        return knn.get_neighbours([query_image_vector])

    # Relevant images.
    indices_rel = [pos for pos, label in enumerate(predictions) if label == 1]
    print("Relevant", indices_rel)
    print("Neighbours Rel", query_neighbours(indices_rel))

    # Irrelevant images.
    indices_ir = [pos for pos, label in enumerate(predictions) if label == -1]
    print("Irrelevant", indices_ir)
    print("Neighbours IR", query_neighbours(indices_ir))

    image_ids = list(similar_image_vectors_g.keys())
    return [image_ids[pos] for pos in indices_rel + indices_ir]
def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
    """Find the best k and distance function by validation F1 score.

    Tries every distance function in ``distance_funcs`` with every odd k in
    1..29, trains a KNN model on the training data and scores its
    predictions on the validation data with ``f1_score``.

    Ties on F1 are broken by ``self.distance_function_map``: the function
    with the larger mapped value wins; for equal mapped values the smaller
    k wins. The stored ``best_model`` is always the trained model that
    produced the best score.

    :param distance_funcs: dict mapping distance-function name to function.
    :param x_train: List[List[int]] training features.
    :param y_train: List[int] training labels.
    :param x_val: List[List[int]] validation features.
    :param y_val: List[int] validation labels.

    Results are stored in ``self.best_k``, ``self.best_distance_function``
    and ``self.best_model``; they stay ``None`` if nothing was evaluated.
    """
    self.best_k = None
    self.best_distance_function = None
    self.best_model = None
    max_f1 = -1

    for func_key in distance_funcs:
        for k in range(1, 30, 2):
            model = KNN(k, distance_funcs[func_key])
            model.train(x_train, y_train)
            predicted_y = model.predict(x_val)
            current_f1 = f1_score(y_val, predicted_y)

            if current_f1 > max_f1:
                is_better = True
            elif current_f1 == max_f1:
                # Tie on F1: compare distance-function priority, then k.
                current_rank = self.distance_function_map[func_key]
                best_rank = self.distance_function_map[
                    self.best_distance_function]
                is_better = current_rank > best_rank or (
                    current_rank == best_rank and k < self.best_k)
            else:
                is_better = False

            if is_better:
                max_f1 = current_f1
                self.best_k = k
                self.best_distance_function = func_key
                # Keep the trained instance; never swap in a fresh,
                # untrained KNN.
                self.best_model = model
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """Find the best scaler, distance function and k by validation F1.

    Same search as ``tuning_without_scaling`` except that every scaler in
    ``scaling_classes`` is instantiated and applied to both the training
    and the validation features before the KNN models are trained.

    :param distance_funcs: dict mapping distance-function name to function.
        Every function is tried with every k.
    :param scaling_classes: dict mapping scaler name to scaler class.
    :param x_train: List[List[int]] training features.
    :param y_train: List[int] training labels.
    :param x_val: List[List[int]] validation features.
    :param y_val: List[int] validation labels.

    Stores the winning k, distance-function name, scaler name and trained
    model in ``self.best_k``, ``self.best_distance_function``,
    ``self.best_scaler`` and ``self.best_model`` (all ``None`` if nothing
    was evaluated).

    Ties are resolved in favour of the combination evaluated first, i.e. by
    insertion order of ``scaling_classes``, then insertion order of
    ``distance_funcs``, then the smaller k.
    """
    self.best_k = None
    self.best_distance_function = None
    self.best_scaler = None
    self.best_model = None
    best_f1 = -1

    for scaler_name in scaling_classes:
        scaler = scaling_classes[scaler_name]()
        x_train_scaled = scaler(x_train)
        x_val_scaled = scaler(x_val)
        for func_name in distance_funcs:
            for k in range(1, 30, 2):
                knn = KNN(k, distance_funcs[func_name])
                knn.train(x_train_scaled, y_train)
                score = f1_score(y_val, knn.predict(x_val_scaled))
                # Strict ">" keeps the earliest candidate on ties.
                if score > best_f1:
                    best_f1 = score
                    self.best_k = k
                    self.best_distance_function = func_name
                    self.best_scaler = scaler_name
                    # The kept model is the one trained on scaled data and
                    # built from the caller's distance_funcs entry.
                    self.best_model = knn
def testscale(self, scale, trial=None):
    """Run ``self.validation`` on temporarily rescaled data.

    ``self.data`` and ``self.testdata`` are replaced by their rescaled
    versions and ``self.knn`` is rebuilt for the duration of the validation
    run. The original data and a KNN built on it are restored afterwards,
    also when the validation raises.

    :param scale: scale argument forwarded to ``self.rescale``.
    :param trial: argument forwarded to ``self.validation``.
    :return: whatever ``self.validation(trial)`` returns.
    """
    backup = self.data, self.testdata
    self.data, self.testdata = self.rescale(self.data, scale), self.rescale(
        self.testdata, scale)
    try:
        self.knn = KNN(self.data)
        return self.validation(trial)
    finally:
        # Always undo the temporary rescaling so a failed run cannot leave
        # the object holding scaled data.
        self.data, self.testdata = backup
        self.knn = KNN(self.data)
def main(argv):
    """Print KNN results for the Iris and the Wisconsin breast-cancer data.

    :param argv: command-line arguments (not used by this function).
    """
    iris_knn = KNN('iris.data', 'iris.test', 4)
    iris_training = iris_knn.get_training_set()
    iris_testing = iris_knn.get_testing_set()
    print('**********\n***** IRIS ****\n***********')
    print_result(iris_knn, iris_training, iris_testing)

    wdbc_knn = KNN('wdbc.data', 'wdbc.test', 1, 0)
    print('**********\n***** Breast Cancer in Wisconsin ****\n***********')
    wdbc_training = wdbc_knn.get_training_set()
    wdbc_testing = wdbc_knn.get_testing_set()
    print_result(wdbc_knn, wdbc_training, wdbc_testing)
def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
    """Pick the best distance function and k by validation F1 score.

    Evaluates the six distance functions named below, in priority order,
    with every odd k that is below 30 and not larger than the number of
    training points. The first candidate that reaches the maximum F1 wins,
    which gives the tie-break
    [canberra > minkowski > euclidean > gaussian > inner_prod > cosine_dist]
    and, within one distance function, the smaller k.

    :param distance_funcs: dict mapping distance-function name to function;
        it must contain all six names listed above.
    :param x_train: List[List[int]] training features.
    :param y_train: List[int] training labels.
    :param x_val: List[List[int]] validation features.
    :param y_val: List[int] validation labels.

    Stores the result in ``self.best_k``, ``self.best_distance_function``
    and ``self.best_model`` (a KNN freshly trained on the training data).
    """
    priority_order = [
        'canberra', 'minkowski', 'euclidean', 'gaussian', 'inner_prod',
        'cosine_dist'
    ]

    # Parallel lists: candidates[j] produced scores[j].
    candidates = []
    scores = []
    for func_name in priority_order:
        for k in range(1, min(30, len(x_train) + 1), 2):
            classifier = KNN(k, distance_funcs[func_name])
            classifier.train(x_train, y_train)
            scores.append(f1_score(y_val, classifier.predict(x_val)))
            candidates.append((func_name, k))

    # list.index returns the first occurrence, i.e. the highest-priority
    # candidate among those sharing the maximum score.
    winner = scores.index(max(scores))
    self.best_distance_function, self.best_k = candidates[winner]
    self.best_model = KNN(self.best_k,
                          distance_funcs[self.best_distance_function])
    self.best_model.train(x_train, y_train)
def test_compare_to_scikit_learn_changing_k(self):
    """Compare our KNN accuracy with scikit-learn's for k = 1..11.

    The normalised data set is split into a training part and the last
    ``testSize`` samples as test part. For every k both classifiers guess
    each test sample and the resulting accuracies must agree to three
    decimal places.
    """
    normalizer = Normalizer(self.data)
    data = normalizer.normalize()
    testSize = 100
    trainSize = len(data.data) - testSize

    for neighbours in range(1, 12):
        with self.subTest(i=neighbours):
            print("k: ", neighbours)
            trainData = {
                'data': data.data[:trainSize],
                'target': data.target[:trainSize],
            }
            # Test targets must come from the same tail slice as the test
            # samples, otherwise labels and samples do not line up.
            testData = {
                'data': data.data[trainSize:],
                'target': data.target[trainSize:],
            }

            knn = KNN(trainData)
            # scikit-learn reference model
            model = KNeighborsClassifier(n_neighbors=neighbours)
            model.fit(trainData['data'], trainData['target'])

            ourCounter = 0
            sciCounter = 0
            for sample, expected in zip(testData['data'],
                                        testData['target']):
                if knn.makeGuess(sample, neighbours) == expected:
                    ourCounter += 1
                if model.predict([sample])[0] == expected:
                    sciCounter += 1

            self.assertAlmostEqual(ourCounter / testSize,
                                   sciCounter / testSize, 3)
def question_4(points):
    """Question 4: print 2-fold CV accuracy per normalizer for k=5 and k=7.

    :param points: list of Point
    """
    normalizers = (
        (DummyNormalizer, "DummyNormalizer"),
        (SumNormalizer, "SumNormalizer"),
        (MinMaxNormalizer, "MinMaxNormalizer"),
        (ZNormalizer, "ZNormalizer"),
    )

    print("Question 4:")
    for k in (5, 7):
        print("K=", k, sep="")
        model = KNN(k)
        model.train(points)
        cv = CrossValidation()
        for normalizer_cls, label in normalizers:
            normalizer = normalizer_cls()
            normalizer.fit(points)
            normalized_points = normalizer.transform(points)
            # 2 folds were found to work best (per the original author).
            average_score = cv.run_cv(normalized_points, 2, model,
                                      accuracy_score, False, True)
            print("Accuracy of", label, "is",
                  "{:.2f}".format(average_score))
        print()
def execute():
    """Fill missing values in the reduced data set and print the result.

    Loads ``data/reduced_dataset_v3.csv``, drops the identifier columns,
    normalises the data, orders the columns as returned by
    ``get_least_nan_columns``, finds related features (via KNN or
    correlation, depending on ``sbs_reg.sim_fun``), fills the missing data
    and prints the un-normalised frame.
    """
    df = pd.read_csv('data/reduced_dataset_v3.csv')
    df = df.drop(
        ['Unnamed: 0', 'Time', 'Time Code', 'Country Name', 'Country Code'],
        axis=1)

    sbs_reg = StepByStepRegression(df, 'correlation')
    sbs_reg.normalize()

    # presumably a pandas Series (or dict) of column name -> NaN count,
    # ordered from fewest to most NaNs - verify against StepByStepRegression
    sorted_columns = sbs_reg.get_least_nan_columns(sbs_reg.normalized_df)

    sorted_df = pd.DataFrame()
    correlated_features_list = dict()
    # .items() replaces Series.iteritems(), which pandas 2.0 removed.
    for col, _nulls in sorted_columns.items():
        sorted_df[col] = sbs_reg.normalized_df[col]

    if sbs_reg.sim_fun == 'KNN':
        knn = KNN(sorted_df, 3)
        correlated_features_list = knn.findKNeighbors()
    elif sbs_reg.sim_fun == 'correlation':
        correlated_features_list = sbs_reg.correlation(sorted_df)

    print('correlated features', correlated_features_list)
    sbs_reg.fill_missing_data(sorted_df, correlated_features_list)
    un_normalized = sbs_reg.un_normalize(sorted_df)
    print(un_normalized)
def test_iris_regression(self):
    """
    Tests kNN for regression: predicts Iris petal width from petal length
    with a single neighbour and checks output shape and error.
    """
    n_neighbours = 1
    iris = load_iris()
    regressor = KNN(n_neighbours, "average")

    # petal length is the only input feature; reshape keeps it 2D
    petal_length = iris.data[:, 2].reshape(-1, 1)
    # petal width is the regression target
    petal_width = iris.data[:, 3]

    regressor.fit(petal_length, petal_width)
    predicted = regressor.predict(petal_length)

    # output must be one-dimensional with one value per sample
    self.assertEqual(len(predicted.shape), 1)
    self.assertEqual(predicted.shape[0], iris.data.shape[0])

    # with a single neighbour every training point can match itself, but
    # several samples share the same petal length, so the error is only
    # required to be small rather than zero
    error = mean_squared_error(petal_width, predicted)
    self.assertLess(error, 0.1)
def test_knn_regression():
    """Endlessly compare the KNN regressor with scikit-learn on random data.

    Each round draws a random problem size, neighbour count, leaf size and
    weighting scheme, fits both implementations and asserts that their
    predictions agree. The loop only stops when an assertion fails or the
    process is interrupted.
    """
    while True:
        # The order of the random draws is kept as-is.
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = np.random.choice(["uniform", "distance"])

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.rand(N)

        ours = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=False,
                   weights=weights)
        ours.fit(X, y)
        our_preds = ours.predict(X_test)

        reference_kwargs = dict(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        reference = KNeighborsRegressor(**reference_kwargs)
        reference.fit(X, y)
        reference_preds = reference.predict(X_test)

        for mine, theirs in zip(our_preds, reference_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def main():
    """Classify test trajectories with a 5-NN/DTW model and write a CSV.

    Loads the training and test sets, encodes the journey pattern ids,
    cross-validates the classifier, fits it on all training trajectories
    and writes the predicted categories for the test set via
    ``writeInCsv``.
    """
    trajectory_converter = {'Trajectory': literal_eval}
    trainSet = pd.read_csv('datasets/train_set.csv',
                           converters=trajectory_converter)
    testSet = pd.read_csv('datasets/test_set_a2.csv',
                          converters=trajectory_converter)

    # numeric labels for the journey pattern categories
    le = preprocessing.LabelEncoder()
    categoryIds = le.fit_transform(trainSet['journeyPatternId'])

    allSequences = list(trainSet['Trajectory'])

    # KNN classifier with 5 neighbours and the DTW distance
    clf = KNN(5, DTW)
    crossValidation(clf, allSequences, categoryIds, le)
    clf.fit(allSequences, categoryIds)

    # predict the test set and map the ids back to category names
    predCategs = le.inverse_transform(clf.predict(testSet['Trajectory']))
    writeInCsv(predCategs)
def fit(self, X, y):
    """Fit the stacking ensemble.

    A random forest, a KNN and a naive Bayes model are fitted on ``X``/``y``;
    their predictions on ``X`` are stacked column-wise and used to fit a
    decision tree as meta-classifier. All four fitted models are stored on
    ``self`` (``rf``, ``knn``, ``nb``, ``meta_classifier``).

    :param X: training features.
    :param y: training labels.
    """
    # base models
    forest = RandomForest(num_trees=15)
    neighbours = KNN(k=3)
    bayes = NaiveBayes(num_classes=2)

    # random forest
    forest.create_splits(X)
    forest.fit(X, y)
    forest_pred = forest.predict(X)

    # k-nearest neighbours
    neighbours.fit(X, y)
    neighbours_pred = neighbours.predict(X)

    # naive Bayes
    bayes.fit(X, y)
    bayes_pred = bayes.predict(X)

    # one column per base model, one row per sample
    columns = [
        pred.reshape((pred.size, 1))
        for pred in (forest_pred, neighbours_pred, bayes_pred)
    ]
    meta_input = np.hstack(columns)

    # decision tree of unlimited depth as meta-classifier
    meta = DecisionTree(max_depth=np.inf)
    meta.fit(meta_input, y)

    self.rf = forest
    self.knn = neighbours
    self.nb = bayes
    self.meta_classifier = meta
def test_blob_classification_numpy(self):
    """
    Tests kNN classification (``predict_numpy``) on randomly-generated
    points drawn from Gaussian-shaped clusters, using a stratified
    train/test split and k=3.
    """
    n_neighbours = 3
    samples, labels = generate_cluster_samples()
    train_X, test_X, train_y, test_y = train_test_split(
        samples, labels, stratify=labels)

    classifier = KNN(n_neighbours)
    classifier.fit(train_X, train_y)
    predicted = classifier.predict_numpy(test_X)

    # output must be one-dimensional with one label per test sample
    self.assertEqual(len(predicted.shape), 1)
    self.assertEqual(predicted.shape[0], test_X.shape[0])

    # every held-out point is expected to be classified correctly
    self.assertAlmostEqual(accuracy_score(test_y, predicted), 1.0)
def main():
    """Report 10-fold KNN accuracy on the CM1 data set for several k.

    For each k the mean accuracy (in percent), its standard deviation and
    the elapsed wall-clock time over all folds are printed.
    """
    neighbour_counts = [1, 2, 3, 5, 7, 9, 11, 13, 15]

    # load CM1 and normalize the features
    raw = arff.loadarff('./datasets/CM1.arff')
    X, Y = build_dataframe(raw)
    X = normalize_data(X)

    # k-fold splitter shared by all values of k
    kf = KFold(n_splits=10)

    for k in neighbour_counts:
        clf = KNN(k=k)
        print("k equals {}".format(k))
        start_time = time.time()

        fold_scores = []
        for train_idx, test_idx in kf.split(X):
            clf.fit(X.iloc[train_idx], Y.iloc[train_idx])
            predictions = clf.predict(X.iloc[test_idx])
            hit_ratio = np.sum(predictions == Y.iloc[test_idx]) / len(test_idx)
            fold_scores.append(hit_ratio * 100)

        end_time = time.time()
        fold_scores = np.array(fold_scores)
        print("mean accuracy: {}".format(np.mean(fold_scores)))
        print("standard deviation: {}".format(np.std(fold_scores)))
        print("time elapsed: {}".format(end_time - start_time))
def test_synthetic_data(self):
    """
    Test KNN.predict on a small hand-made 2-D data set, first with three
    neighbours and then with one.
    """
    train_points = np.array([[1, 2], [1, 3], [2, 2], [2, 3], [1, 1], [2, 1]])
    train_labels = np.array([1, 1, 1, -1, -1, -1])
    queries = np.array([
        [1.8, 2.6],
        [2.0, 1.8],
        [1.5, 2.0],
        [1.0, 2.5],
        [1.5, 1.0],
        [2.0, 1.0],
    ])
    expected_3nn = np.array([1, 1, 1, 1, -1, -1])
    expected_1nn = np.array([-1, 1, 1, 1, -1, -1])

    model = KNN(k=3)
    model.fit(train_points, train_labels)
    self.assertTrue(np.array_equal(model.predict(queries), expected_3nn))

    # switching to 1-nn flips the label of the first query only
    model.k = 1
    self.assertTrue(np.array_equal(model.predict(queries), expected_1nn))
def computeKNNCrossValidation(args, dict_algorithms):
    """Run KNN cross-validation and store the result under ``"knn"``.

    :param args: options object; passed to ``KNN`` and read for ``debug``.
    :param dict_algorithms: dict that receives the cross-validation result
        in its ``"knn"`` entry (modified in place).
    """
    if args.debug:
        # no newline, so that "ok!" ends up on the same line
        print("Running knn...", end='')

    dict_algorithms["knn"] = KNN(args).computeCrossValidation()

    if args.debug:
        print("ok!")
def run_knn(points):
    """Train a 5-NN model, show one prediction and run 10-fold CV.

    :param points: sequence of labelled points; the first one is used for
        the printed example prediction.
    """
    classifier = KNN(5)
    classifier.train(points)

    sample = points[0]
    print(f'predicted class: {classifier.predict(sample)}')
    print(f'true class: {sample.label}')

    CrossValidation().run_cv(points, 10, classifier, accuracy_score)
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """Grid-search scaler, distance function and k by validation F1 score.

    Each scaler is applied to the *original* training and validation data
    (never to the output of a previously tried scaler). For every distance
    function and every odd k below ``min(len(x_train), 30)`` a KNN model is
    trained on the scaled data and scored with ``f1_score``; the first
    combination reaching the highest score wins.

    :param distance_funcs: dict mapping distance-function name to function.
    :param scaling_classes: dict mapping scaler name to scaler class.
    :param x_train: training features.
    :param y_train: training labels.
    :param x_val: validation features.
    :param y_val: validation labels.
    :return: tuple ``(best_k, best_distance_function, best_scaler,
        best_model)``, also stored on ``self``. If nothing was evaluated
        the values are ``0``, ``""``, ``""`` and ``None``.
    """
    optimal_k = 0
    function = ""
    scalar = ""
    model = None
    f1_scores = -2 ** 32
    max_k = min(len(x_train), 30)

    for scaler_name, scaler_cls in scaling_classes.items():
        scaler = scaler_cls()
        # Use fresh names: overwriting x_train/x_val here would feed
        # already-scaled data to the next scaler.
        scaled_train = scaler(x_train)
        scaled_val = scaler(x_val)
        for key, value in distance_funcs.items():
            for k_value in range(1, max_k, 2):
                train_model = KNN(k_value, value)
                train_model.train(scaled_train, y_train)
                cur_f1 = f1_score(y_val, train_model.predict(scaled_val))
                if f1_scores < cur_f1:
                    optimal_k = k_value
                    function = key
                    model = train_model
                    f1_scores = cur_f1
                    scalar = scaler_name

    self.best_k = optimal_k
    self.best_distance_function = function
    self.best_scaler = scalar
    self.best_model = model
    return self.best_k, self.best_distance_function, self.best_scaler, self.best_model
def main():
    """Train and evaluate a KNN model on the data set named in ``args``.

    Reads ``Data/<args.dataset>`` (last column is the target), holds out 20%
    of the rows for testing and, depending on ``args.type``, prints either
    classification metrics (confusion matrix, report, accuracy) or the
    regression mean squared error. Any other type prints a message and
    returns.
    """
    import os

    # os.path.join avoids the invalid "\D" / "\{" escapes and the
    # Windows-only separator of the former hard-coded path.
    df = pd.read_csv(os.path.join("Data", args.dataset))
    X = np.array(df.iloc[:, :-1])
    y = np.array(df.iloc[:, -1])
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    knn = KNN(X_train, y_train, k=args.k)

    if args.type == "clf":
        y_pred = knn.predict(X_test, knn_type="clf")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))
        # Accuracy is only meaningful for class labels, so it is no longer
        # printed for regression output.
        accuracy = np.mean(y_pred == y_test)
        print(accuracy)
    elif args.type == "reg":
        y_pred = knn.predict(X_test, knn_type="reg")
        mse = np.mean((y_test - y_pred)**2)
        print(mse)
    else:
        print("Undefined knn type")
        return
def tuning_with_scaling(self, distance_funcs, scaling_classes, x_train, y_train, x_val, y_val):
    """Find the best scaler, distance function and k by validation F1.

    Same search as ``tuning_without_scaling`` except that every scaler in
    ``scaling_classes`` is instantiated and applied to both the training
    and the validation features before the KNN models are trained.

    :param distance_funcs: dictionary of distance functions (key is the
        function name, value is the function). Every function is tried
        with every k.
    :param scaling_classes: dictionary of scalers (key is the scaler name,
        value is the scaler class).
    :param x_train: List[List[int]] training features.
    :param y_train: List[int] training labels.
    :param x_val: List[List[int]] validation features.
    :param y_val: List[int] validation labels.

    Assigns the best k, distance-function name, scaler name and trained
    model to ``self.best_k``, ``self.best_distance_function``,
    ``self.best_scaler`` and ``self.best_model``.

    NOTE: Ties keep the candidate evaluated first, i.e. priority follows
    the insertion order of ``scaling_classes`` ("min_max_scale" before
    "normalize"), then the insertion order of ``distance_funcs``, then the
    smaller k.
    """
    # Start below the smallest possible F1 (0) so that a result is recorded
    # even when every candidate scores exactly 0.
    best_f1 = -1
    for scaling_name, scaling_func in scaling_classes.items():
        scaler = scaling_func()
        x_train_scaled = scaler(x_train)
        x_val_scaled = scaler(x_val)
        for name, func in distance_funcs.items():
            for k in range(1, 30, 2):
                model = KNN(k, func)
                model.train(x_train_scaled, y_train)
                valid_f1 = f1_score(y_val, model.predict(x_val_scaled))
                if valid_f1 > best_f1:
                    self.best_distance_function = name
                    self.best_k = k
                    best_f1 = valid_f1
                    self.best_model = model
                    self.best_scaler = scaling_name
def tuneK(dataMat, labels, folds, categ, cnnSub=None):
    '''
    Tune K by picking the value in 1..10 with the smallest classification
    error on a validation fold.

    One of the given folds is chosen at random as the validation set; the
    remaining folds form the training set unless ``cnnSub`` is given, in
    which case the rows indexed by ``cnnSub`` are used for training
    instead. When several K share the smallest error, the smallest such K
    is returned.

    Returns the best K and the array of errors for K = 1..10.
    '''
    Ks = np.arange(1, 11)       # candidate K values
    err = np.empty(len(Ks))     # one error per candidate
    pick = np.random.randint(len(folds))    # index of the validation fold

    # split the folds into training folds and the validation fold
    trainFolds, validFolds = [], []
    for foldNo, fold in enumerate(folds):
        if foldNo == pick:
            validFolds.append(fold)
        else:
            trainFolds.append(fold)
    trnIdx = np.hstack(trainFolds)
    vldIdx = np.hstack(validFolds)

    dataTrain, labelTrain = dataMat[trnIdx, :], labels[trnIdx]
    dataTest, labelTest = dataMat[vldIdx, :], labels[vldIdx]

    # an explicit training subset replaces the fold-based training set
    if cnnSub is not None:
        dataTrain = dataMat[cnnSub]
        labelTrain = labels[cnnSub]

    for slot, k in enumerate(Ks):
        predicted = KNN(dataTrain, labelTrain, dataTest, k, categorical=categ)
        err[slot] = errRate(predicted, labelTest, categorical=categ)

    bestK = Ks[np.argmin(err)]
    return bestK, err
def tuning_without_scaling(self, distance_funcs, x_train, y_train, x_val, y_val):
    """Find the best distance function and k by validation F1 score.

    Tries every distance function in ``distance_funcs`` with every k among
    1, 3, 5, ..., 29, trains a KNN model on the training data and scores
    its predictions on the validation set with ``f1_score``.

    :param distance_funcs: dictionary of distance functions (key is the
        function name, value is the function). Every function is tried
        with every k.
    :param x_train: List[List[int]] training features.
    :param y_train: List[int] training labels.
    :param x_val: List[List[int]] validation features.
    :param y_val: List[int] validation labels.

    Assigns the best k, distance-function name and trained model to
    ``self.best_k``, ``self.best_distance_function`` and
    ``self.best_model``. ``self.best_scaler`` is not touched here.

    NOTE: Ties keep the candidate evaluated first, i.e. priority follows
    the insertion order of ``distance_funcs`` (euclidean > Minkowski >
    cosine_dist) and, for the same distance function, the smaller k.
    """
    # Start below the smallest possible F1 (0) so that a result is recorded
    # even when every candidate scores exactly 0.
    best_f1 = -1
    for name, func in distance_funcs.items():
        for k in range(1, 30, 2):
            model = KNN(k, func)
            model.train(x_train, y_train)
            valid_f1 = f1_score(y_val, model.predict(x_val))
            if valid_f1 > best_f1:
                self.best_distance_function = name
                self.best_k = k
                best_f1 = valid_f1
                self.best_model = model
def test_knn_clf():
    """Endlessly compare the KNN classifier with scikit-learn on random data.

    Each round draws a random problem size, neighbour count, class count
    and leaf size, fits both implementations with uniform weights and
    asserts that their predictions agree. The loop only stops when an
    assertion fails or the process is interrupted.
    """
    while True:
        N = np.random.randint(2, 100)
        M = np.random.randint(2, 100)
        k = np.random.randint(1, N)
        # At least one class: with 0 classes the label draw below would
        # call randint(0, 0) and raise ValueError.
        n_classes = np.random.randint(1, 10)
        ls = np.min([np.random.randint(1, 10), N - 1])
        weights = "uniform"

        X = np.random.rand(N, M)
        X_test = np.random.rand(N, M)
        y = np.random.randint(0, n_classes, size=N)

        knn = KNN(k=k, leaf_size=ls, metric=euclidean, classifier=True,
                  weights=weights)
        knn.fit(X, y)
        preds = knn.predict(X_test)

        gold = KNeighborsClassifier(
            p=2,
            leaf_size=ls,
            n_neighbors=k,
            weights=weights,
            metric="minkowski",
            algorithm="ball_tree",
        )
        gold.fit(X, y)
        gold_preds = gold.predict(X_test)

        for mine, theirs in zip(preds, gold_preds):
            np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
def test_knn(k, train_data, train_labels, test_data):
    """
    test_knn function

    Builds a KNN classifier from the given training set, classifies every
    sample of the testing data and writes the guesses to
    'predictions_digit_recognizer.csv' with the columns ImageId (1-based
    position of the sample) and Label.

    Args
    ----
    k : integer
      number of neighbors to use for KNN
    train_data : np.array
      training dataset
    train_labels : np.array
      training dataset labels
    test_data : np.array
      testing dataset

    Returns
    -------
    None
    """
    print("Final k:" + str(k))
    classifier = KNN(k, train_data, train_labels)

    # write the predictions as CSV, one row per test sample
    with open('predictions_digit_recognizer.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ImageId', 'Label'])
        writer.writerows(
            [str(position + 1),
             str(int(classifier.classify(test_data[position])))]
            for position in range(len(test_data)))