def knn_cross_val_score(X, y, k_list, score='accuracy', cv=None, **kwargs):
    """Cross-validate a KNN classifier over several neighbor counts.

    :param X: train samples
    :param y: targets for train
    :param k_list: list of neighbor counts, in ascending order
    :param score: metric name (only 'accuracy' is supported)
    :param cv: list of (train_idx, valid_idx) tuples; built by kfold when None
    :param kwargs: parameters forwarded to KNNClassifier.__init__
    :return: dict mapping each k from k_list to a numpy array of length
        len(cv) with the accuracy on each fold
    """
    if cv is None:
        cv = kfold(X.shape[0])
    metric_per_k = {k: np.empty(0) for k in k_list}
    for train_idx, valid_idx in cv:
        # Build one model per fold with the largest k so a single neighbor
        # search covers every k in k_list.
        if 'k' not in kwargs:
            knn = KNNClassifier(k_list[-1], **kwargs)
        else:
            knn = KNNClassifier(**kwargs)
        knn.fit(X[train_idx], y[train_idx])
        # Return value is discarded: presumably find_kneighbors caches the
        # distances inside the classifier for predict_for_cv -- TODO confirm.
        knn.find_kneighbors(X[valid_idx], return_distance=True)
        for k in k_list:
            knn.k = k
            y_valid = knn.predict_for_cv(X[valid_idx])
            if score == "accuracy":
                metric_per_k[k] = np.append(metric_per_k[k],
                                            accuracy(y_valid, y[valid_idx]))
    return metric_per_k
def knn_cross_val_score(X, y, k_list, score, cv=None, **kwargs):
    """Return {k: list of per-fold accuracies} for each k in k_list.

    Fixes vs. the previous version:
    * the ``cv`` argument was silently ignored, and ``KFold(k)`` tied the
      number of folds to the neighbor count -- folds are now fixed and
      shared across all k, so the scores are comparable;
    * elementwise ``is`` identity comparison was used instead of ``==``
      (identity on numpy scalars is never reliable).

    :param cv: iterable of (train_index, test_index) pairs; 3-fold KFold
        splits are generated when None.
    """
    if cv is None:
        cv = list(KFold(3).split(X))
    res = {}
    for k in k_list:
        knn = KNNClassifier(k, **kwargs)
        res[k] = []
        for train_index, test_index in cv:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            knn.fit(X_train, y_train)
            my_y = knn.predict(X_test)
            # Value equality, not object identity.
            correct = sum(1 for pred, true in zip(my_y, y_test) if pred == true)
            res[k].append(correct / len(my_y))
    return res
def knn_cross_val_score(X, y, k_list, score='accuracy', cv=None, **kwargs):
    """Cross-validate KNN for every k with one neighbor search per fold.

    Neighbors are found once for the largest k; votes are accumulated
    incrementally over consecutive k_list "chunks", so each larger k reuses
    the running counts of the previous one.

    :param k_list: ascending list of neighbor counts
    :return: dict {k: np.array of per-fold accuracies, length len(cv)}
    """
    k_list = list(k_list)
    X = X.astype(float)
    y = y.astype(int)
    if cv is None:
        cv = kfold(len(X), 3)
    res = defaultdict(lambda: np.zeros(len(cv), dtype=float))
    for t, (train, test) in enumerate(cv):
        model = KNNClassifier(k=k_list[-1], **kwargs)
        model.fit(X[train], y[train])
        # .get avoids a KeyError when 'weights' was never passed at all.
        if not kwargs.get('weights'):
            neighbors = model.find_kneighbors(X[test], return_distance=False)
            weights = np.ones_like(neighbors, dtype=float)
        else:
            dists, neighbors = model.find_kneighbors(X[test], return_distance=True)
            weights = 1 / (dists + 1e-5)
        labels = y[train][neighbors]
        for i, (cur_labels, cur_w) in enumerate(zip(labels, weights)):
            cnt = np.zeros(cur_labels.max() + 1)
            # Running argmax of the weighted vote counts.
            maxa, maxy = 0, cur_labels[0]
            for prev_k, cur_k in zip([0] + k_list, k_list):
                for label, w in zip(cur_labels[prev_k:cur_k], cur_w[prev_k:cur_k]):
                    cnt[label] += w
                    if cnt[label] > maxa:
                        maxa, maxy = cnt[label], label
                res[cur_k][t] += (maxy == y[test][i])
        for k in k_list:
            res[k][t] /= len(test)
    return dict(res)
def knn_cross_val_score(X, y, k_list, score='accuracy', cv=None, **kwargs):
    """Score a KNN classifier for each k in k_list via cross-validation.

    One classifier is built with the largest k; smaller k values reuse the
    same neighbor search by truncating columns of the neighbor matrices.

    :return: dict {k: np.array of per-fold accuracies}
    """
    if cv is None:
        cv = kfold(X.shape[0], 3)
    ans = {k: np.empty(len(cv)) for k in k_list}
    knn = KNNClassifier(k=k_list[-1], **kwargs)
    for index, fold in enumerate(cv):
        train_idx, valid_idx = fold[0], fold[1]
        knn.fit(X[train_idx, :], y[train_idx])
        curr_dist, curr_neighbors = knn.find_kneighbors(X=X[valid_idx],
                                                        return_distance=True)
        if knn.weights:
            weights = (curr_dist + (10 ** (-5))) ** (-1)
        else:
            weights = np.ones(curr_neighbors.shape)
        classes = np.unique(y[train_idx])
        # Largest k first, so the neighbor columns can be trimmed in place.
        for k in reversed(k_list):
            curr_weights = weights[:, :k]
            curr_neighbors = curr_neighbors[:, :k]
            res = online_predict(curr_weights, y[train_idx], k,
                                 curr_neighbors, classes, fold)
            ans[k][index] = (res == y[valid_idx]).sum() / res.shape[0]
    return ans
def knn_cross_val_score(X, y, k_list, score='accuracy', cv=None, **kwargs):
    """Cross-validate KNN, reusing the neighbor search of the largest k.

    The classifier is fit once per fold with max(k_list); each smaller k is
    scored by trimming columns off the cached neighbor index (and distance)
    matrices instead of refitting.

    :param k_list: ascending list of neighbor counts
    :param cv: list of (train_idx, test_idx) pairs; 3-fold split when None
    :return: dict {k: np.array of per-fold accuracy values}
    """
    if cv is None:
        cv = kfold(np.size(X, 0), 3)
    score_dict = dict()
    curr_ind = list()
    # .get avoids a KeyError when the caller never passed 'weights'.
    weight_flag = kwargs.get('weights', False)
    if weight_flag:
        curr_weights = list()
    curr_score = np.empty(len(cv))
    for j in range(len(cv)):
        classifier = KNNClassifier(k_list[-1], **kwargs)
        classifier.fit(X[cv[j][0]], y[cv[j][0]])
        y_knn = classifier.predict(X[cv[j][1]])
        if weight_flag:
            curr_w, curr_i = classifier.find_kneighbors(X[cv[j][1]], True)
            curr_weights.append(curr_w)
            curr_ind.append(curr_i)
        else:
            curr_i = classifier.find_kneighbors(X[cv[j][1]], False)
            curr_ind.append(curr_i)
        if score == 'accuracy':
            num_diff = np.sum(y_knn == y[cv[j][1]])
            curr_score[j] = num_diff / np.size(y_knn)
    score_dict[k_list[-1]] = curr_score
    # Walk k_list from second-largest down, dropping the extra neighbors.
    for i in range(len(k_list) - 2, -1, -1):
        k_diff = k_list[i + 1] - k_list[i]
        curr_score = np.empty(len(cv))
        for j in range(len(cv)):
            curr_ind[j] = curr_ind[j][:, :-k_diff]
            if weight_flag:
                curr_weights[j] = curr_weights[j][:, :-k_diff]
                y_ind = y[cv[j][0]][curr_ind[j]]
                y_knn = np.empty(np.size(y_ind, 0))
                for k in range(np.size(y_ind, 0)):
                    # Inverse-distance votes; 1e-5 guards division by zero.
                    y_knn[k] = np.argmax(
                        np.bincount(y_ind[k],
                                    weights=1 / (curr_weights[j][k] + 1e-5)))
            else:
                y_ind = y[cv[j][0]][curr_ind[j]]
                y_knn = np.empty(np.size(y_ind, 0))
                for k in range(np.size(y_ind, 0)):
                    y_knn[k] = np.argmax(np.bincount(y_ind[k]))
            if score == 'accuracy':
                num_diff = np.sum(y_knn == y[cv[j][1]])
                curr_score[j] = num_diff / np.size(y_knn)
        score_dict[k_list[i]] = curr_score
    return score_dict
def knn_cross_val_score(X, y, k_list, score, cv=None, **kwargs):
    """Cross-validated accuracy of KNN for each k in k_list.

    Votes are accumulated one neighbor at a time up to max(k_list), so every
    intermediate k in k_list is scored from the same neighbor search.

    The two near-duplicate voting branches of the previous version are
    merged: unweighted voting is the weighted case with every vote equal
    to 1.

    :return: dict {k: np.array of len(cv) accuracies}
    """
    # Falsy check kept from the original: an empty cv is also regenerated.
    if not cv:
        cv = kfold(X.shape[0], 2)
    result = {}
    max_k = k_list[-1]
    for i, (train, test) in enumerate(cv):
        clf = KNNClassifier(k=max_k, **kwargs)
        clf.fit(X[train], y[train])
        if clf.weights:
            dist, idx = clf.find_kneighbors(X[test], True)
        else:
            dist = None
            idx = clf.find_kneighbors(X[test], False)
        nearest_labels = y[train][idx]
        n_test = idx.shape[0]
        # Per-test-point running vote tallies, label -> accumulated weight.
        scores = [{} for _ in range(n_test)]
        for k in range(1, max_k + 1):
            answers = np.zeros(n_test, dtype=np.int32)
            for j in range(n_test):
                label = nearest_labels[j, k - 1]
                vote = 1 if dist is None else 1 / (dist[j, k - 1] + 1e-5)
                scores[j][label] = scores[j].get(label, 0) + vote
                answers[j] = max(scores[j], key=scores[j].get)
            if k in k_list:
                if k not in result:
                    result[k] = np.zeros(len(cv))
                result[k][i] = _accuracy(answers, y[test])
    return result
def knn_cross_val_score_with_aug_for_train(X, y, new_objects_amount=10000,
                                           type_of_transformation='rotation',
                                           param_of_transformation=0,
                                           score='accuracy', k_list=None,
                                           cv=None, metric='cosine',
                                           strategy='brute', weights=True,
                                           k_folds=3):
    """Cross-validate KNN where each fold's train part is augmented first.

    :param X, y: train samples and targets
    :param new_objects_amount: how many augmented objects to generate
    :param type_of_transformation: augmentation kind (e.g. 'rotation')
    :param param_of_transformation: parameter of that transformation
    :param score: metric name (only 'accuracy' is supported)
    :param k_list: list of neighbor counts, in ascending order (required)
    :param cv: list of (train_idx, valid_idx) pairs; built by kfold when None
    :param metric, strategy, weights: KNNClassifier parameters
    :param k_folds: fold count used only when cv is None
    :return: dict {k: np.array of per-fold accuracies}
    :raises ValueError: if k_list is not provided (the old code crashed with
        an opaque TypeError on the dict comprehension instead)
    """
    if k_list is None:
        raise ValueError("k_list must be a non-empty list of neighbor counts")
    if cv is None:
        cv = kfold(X.shape[0], k_folds)
    metric_per_k = {k: np.empty(0) for k in k_list}
    for train_idx, valid_idx in cv:
        knn = KNNClassifier(k_list[-1], metric=metric, strategy=strategy,
                            weights=weights)
        X_train_aug, y_train_aug = augmentation_tools.made_augmentation(
            X[train_idx], y[train_idx],
            new_objects_amount=new_objects_amount,
            type_of_transformation=type_of_transformation,
            param_of_transformation=param_of_transformation)
        knn.fit(X_train_aug, y_train_aug)
        # Return value discarded: presumably caches neighbors for
        # predict_for_cv -- TODO confirm against KNNClassifier.
        knn.find_kneighbors(X[valid_idx], return_distance=True)
        for k in k_list:
            knn.k = k
            y_valid = knn.predict_for_cv(X[valid_idx])
            if score == "accuracy":
                metric_per_k[k] = np.append(
                    metric_per_k[k],
                    accuracy(y_valid.astype(int), y[valid_idx].astype(int)))
    return metric_per_k
def knn_cross_val_score(x, y, k_list, score, cv=None, **kwargs):
    """Cross-validated accuracy of KNN for each k in k_list.

    Fixes vs. the previous version:
    * ``cv == None`` identity check replaced by ``is None``;
    * in the unweighted branch ``k_ind`` was never assigned, raising a
      NameError on first use -- it is now sliced in both paths;
    * ``kwargs['weights']`` no longer raises KeyError when absent;
    * the number of classes is derived from the targets instead of the
      hard-coded 10 (backward compatible for 0..9 labels).

    :return: dict {k: np.array of per-fold accuracies}
    """
    if cv is None:
        cv = kfold(x.shape[0], 3)
    answer = {}
    dop_answer = np.zeros((len(k_list), len(cv)))
    model = KNNClassifier(k=max(k_list), **kwargs)
    use_weights = kwargs.get('weights', False)
    # Labels are assumed to be integers 0..C-1 (they index the vote array).
    n_classes = int(y.max()) + 1
    for j, (train_idx, test_idx) in enumerate(cv):
        train_set, train_target = x[train_idx], y[train_idx]
        test_set, test_target = x[test_idx], y[test_idx]
        model.fit(train_set, train_target)
        if use_weights:
            dist, ind = model.find_kneighbors(test_set, return_distance=True)
            votes = 1 / (dist + 0.00001)
        else:
            ind = model.find_kneighbors(test_set, return_distance=False)
        for i, k in enumerate(k_list):
            sub_answer = np.zeros(test_set.shape[0])
            k_ind = ind[:, :k]  # was missing in the unweighted path
            if use_weights:
                k_votes = votes[:, :k]
            for q in range(len(test_set)):
                ind_array = np.zeros(n_classes)
                for num in range(k_ind.shape[1]):
                    label = train_target[k_ind[q][num]].astype(int)
                    ind_array[label] += k_votes[q][num] if use_weights else 1
                sub_answer[q] = np.argmax(ind_array)
            dop_answer[i][j] = 1 - len(np.where(test_target != sub_answer)[0]) / len(test_target)
    for i, k in enumerate(k_list):
        answer[k] = dop_answer[i]
    return answer
# Evaluate KNN on Gaussian-blurred MNIST images and dump the predictions.
trX, teX, trY, teY = train_test_split(data, target, test_size=1 / 7,
                                      random_state=666)
teX = teX.reshape(len(teX), 28, 28)
trX = trX.reshape(len(trX), 28, 28)
# Despite the old "rotated" naming, this applies a Gaussian blur
# (sigma = sqrt(1.1)), not a rotation.
teX_blurred = np.empty(teX.shape)
trX_blurred = np.empty(trX.shape)
for enum in range(len(teX)):
    teX_blurred[enum] = gaussian(teX[enum], sqrt(1.1), preserve_range=True)
for enum in range(len(trX)):
    trX_blurred[enum] = gaussian(trX[enum], sqrt(1.1), preserve_range=True)
print("Here")
teX = teX_blurred.reshape(len(teX_blurred), 28 * 28)
trX = trX_blurred.reshape(len(trX_blurred), 28 * 28)
model = KNNClassifier(4, "my_own", "cosine", True)
model.fit(trX, trY)
result = model.predict(teX)
print("Accuracy of best method is: ", ac_s(teY, result))
# 'with' guarantees the file is flushed and closed (the old code leaked
# the handle and never closed it).
with open("save_file_4.txt", "w") as f:
    for item in result:
        # NOTE(review): items are written with no separator -- confirm intended.
        f.write(str(item))
def knn_cross_val_score(X, Y, k_list, score, cv, **kwargs):
    """Cross-validated score of a KNN classifier for each k in k_list.

    The three duplicated voting branches of the previous version collapse
    to one: unweighted voting is weighted voting with a constant vote of 1.

    :return: dict {k: np.array of per-fold scores}
    :raises TypeError: if X and Y differ in length, or score is unsupported
        (TypeError kept as-is for backward compatibility with callers).
    """
    if len(X) != len(Y):
        raise TypeError
    if cv is None:
        cv = kfold(len(X), 3)
    if score == "accuracy":
        score_func = accuracy_score
    else:
        raise TypeError
    max_k = max(k_list)
    result = {}
    each_acc = np.empty([len(k_list), len(cv)])
    weighted = bool(kwargs.get("weights", False))
    for enumer, (train_idx, test_idx) in enumerate(cv):
        trX, trY = X[train_idx], Y[train_idx].astype(int)
        teX, teY = X[test_idx], Y[test_idx].astype(int)
        model = KNNClassifier(k=max_k, **kwargs)
        model.fit(trX, trY)
        clusters = np.sort(np.unique(trY))
        clusters_amount = len(np.unique(trY))
        distances, nearest = model.find_kneighbors(teX)
        test_target = np.empty(len(teX)).astype(int)
        for ite, it in enumerate(k_list):
            new_distances = distances[:, :it]
            new_nearest = nearest[:, :it]
            for enum in range(len(teX)):
                cluster_nb = np.zeros(clusters_amount)
                for numb in range(new_nearest.shape[1]):
                    vote = (weight_function(new_distances[enum, numb])
                            if weighted else 1)
                    cluster_nb[trY[new_nearest[enum, numb]]] += vote
                test_target[enum] = clusters[np.argmax(cluster_nb)]
            each_acc[ite, enumer] = score_func(teY, test_target)
    for i, k in enumerate(k_list):
        result[k] = each_acc[i]
    return result
# Benchmark fit / neighbor-search times for each strategy over random
# feature subsamples of several sizes.
fea_subsample_sizes = (10, 20, 100)
strategies = ('my_own', 'brute', 'kd_tree', 'ball_tree')
metrics = ('euclidean', )
times_fit = defaultdict(list)
times_predict = defaultdict(list)
times_fit_predict = defaultdict(list)
for fea_subsample_size in fea_subsample_sizes:
    # Random feature subset (indices may repeat, as before).
    fea_idxs = np.random.randint(0, x.shape[1], [fea_subsample_size])
    x_train_tmp = x_train[:, fea_idxs]
    x_test_tmp = x_test[:, fea_idxs]
    for strategy in strategies:
        for metric in metrics:
            fit_times = []
            predict_times = []
            for _ in range(1):  # single timing run, kept for parity
                model = KNNClassifier(k=5, strategy=strategy, metric=metric)
                t_start = time.time()
                model.fit(x_train_tmp, y_train)
                t_fitted = time.time()
                model.find_kneighbors(x_test_tmp, return_distance=False)
                t_done = time.time()
                fit_times.append(t_fitted - t_start)
                predict_times.append(t_done - t_fitted)
            fit_mean = list_mean(fit_times)
            predict_mean = list_mean(predict_times)
            print("Stratery: ", strategy, " Metric: ", metric)
            print("Fit: ", fit_mean)
            print("Predict: ", predict_mean)
            print("Fit + Predict: ", fit_mean + predict_mean)
            times_fit[fea_subsample_size].append(fit_mean)
            times_predict[fea_subsample_size].append(predict_mean)
            times_fit_predict[fea_subsample_size].append(fit_mean + predict_mean)
# Compare accuracy when training on randomly chosen descriptor subsets of
# increasing size (selected_10 is prepared earlier in the file).
shuffle(all_descriptors)
selected_20 = np.copy(all_descriptors[:20])
shuffle(all_descriptors)
selected_100 = np.copy(all_descriptors[:100])
shuffle(all_descriptors)
selected_200 = np.copy(all_descriptors[:200])
print(trX[:, selected_10].shape)
print(trY.shape)
print("10 selected descriptors are {}\n20 - {}\n100 - {}".format(
    selected_10, selected_20, selected_100))
model = KNNClassifier(5, "my_own", "euclidean", False, 100)
# The same model object is refit on each subset, exactly as before.
results = []
for subset in (selected_10, selected_20, selected_100, selected_200):
    model.fit(trX[:, subset], trY)
    results.append(model.predict(teX[:, subset]))
result_10, result_20, result_100, result_200 = results
print("Accuracy for: \n10 - {}\n20 - {}\n100 - {}\n200 - {}".format(
    ac_s(teY, result_10), ac_s(teY, result_20),
    ac_s(teY, result_100), ac_s(teY, result_200)))