def SequentialRadiusNeighborsClassifier(epsilon, X_train, X_test, Y_train):
    """Label the test points one at a time, closest-to-the-pool first.

    On every round the still-unlabelled test point whose nearest neighbour
    in the current labelled pool is closest gets picked.  If at least one
    pool point lies within ``epsilon`` of it, a radius-neighbours vote gives
    its label; otherwise it is assigned a brand new label
    (``max(known labels) + 1``).  The point and its label are then added to
    the pool before the next round.

    :param epsilon: radius handed to ``RadiusNeighborsClassifier``
    :param X_train: initial labelled samples (copied, never modified)
    :param X_test: samples to label; indexed with integer lists, so an
        ndarray is expected
    :param Y_train: labels for ``X_train`` (copied, never modified)
    :return: list of predicted labels, aligned with ``X_test``
    """
    pool_X = np.copy(X_train)
    pool_y = np.copy(Y_train)
    Reps = RadiusNeighborsClassifier(radius=epsilon)
    n_test = len(X_test)
    Y_predict = [-1] * n_test
    known_labels = list(set(Y_train))
    pending = list(range(n_test))

    for _ in range(n_test):
        # Distance from every pending test point to its closest pool point.
        nearest = NearestNeighbors(n_neighbors=1)
        nearest.fit(pool_X)
        distance_rows = nearest.kneighbors(X_test[pending])[0]
        gaps = [np.mean(row) for row in distance_rows]
        chosen = pending[gaps.index(min(gaps))]
        pending.remove(chosen)

        query = X_test[chosen].reshape(1, -1)
        Reps.fit(pool_X, pool_y)
        within_radius = Reps.radius_neighbors(query)[1][0]
        if within_radius.size > 0:
            label = Reps.predict(query)[0]
        else:
            # Nothing within epsilon: open a new class for this point.
            label = max(known_labels) + 1
            known_labels.append(label)

        Y_predict[chosen] = label
        pool_X = np.append(pool_X, [X_test[chosen]], axis=0)
        pool_y = np.append(pool_y, [label], axis=0)

    return Y_predict
def clusterFacetSamplesRNN(self, reduceRadius=3):
    """
    cluster the samples of each facet using radius nearest neighbours

    the cluster centers and their corresponding normals will be saved in
    self.objsamplepnts_refcls and self.objsamplenrmls_refcls, one entry per
    facet. Each entry is a vertically stacked array of the kept samples, or
    an empty array of shape (0, 0) when the facet has no samples.

    Samples are visited in order; a sample is kept unless an earlier kept
    sample already covers it, and every kept sample marks all of its
    neighbours within reduceRadius as covered.

    :param reduceRadius: the neighbors that fall inside the reduceradius will be removed
    :return: None

    author: weiwei
    date: 20161130, osaka
    """

    nfacets = self.facets.shape[0]
    # Builtin ``object`` replaces the ``np.object`` alias, which was removed
    # in NumPy 1.24 and raises AttributeError there.
    self.objsamplepnts_refcls = np.ndarray(shape=(nfacets,), dtype=object)
    self.objsamplenrmls_refcls = np.ndarray(shape=(nfacets,), dtype=object)
    for i in range(nfacets):
        keptpnts = []
        keptnrmls = []
        X = self.objsamplepnts_ref[i]
        nX = X.shape[0]
        if nX > 0:
            # The classifier is only used for its radius query; the labels
            # passed to fit are just the sample indices.
            neigh = RadiusNeighborsClassifier(radius=1.0)
            neigh.fit(X, np.arange(nX))
            neigharrays = neigh.radius_neighbors(X, radius=reduceRadius, return_distance=False)
            delset = set()
            for j in range(nX):
                if j not in delset:
                    keptpnts.append(np.array(X[j]))
                    keptnrmls.append(np.array(self.objsamplenrmls_ref[i][j]))
                    delset.update(neigharrays[j].tolist())
        if keptpnts:
            self.objsamplepnts_refcls[i] = np.vstack(keptpnts)
            self.objsamplenrmls_refcls[i] = np.vstack(keptnrmls)
        else:
            self.objsamplepnts_refcls[i] = np.empty(shape=(0, 0))
            self.objsamplenrmls_refcls[i] = np.empty(shape=(0, 0))
def clusterFacetSamplesRNN(self, reduceRadius=3):
    """
    cluster the samples of each facet using radius nearest neighbours

    the cluster centers and their corresponding normals will be saved in
    self.objsamplepnts_refcls and self.objsamplenrmls_refcls, one entry per
    facet. Each entry is a vertically stacked array of the kept samples, or
    an empty array of shape (0, 0) when the facet has no samples.

    Samples are visited in order; a sample is kept unless an earlier kept
    sample already covers it, and every kept sample marks all of its
    neighbours within reduceRadius as covered.

    :param reduceRadius: the neighbors that fall inside the reduceradius will be removed
    :return: None

    author: weiwei
    date: 20161130, osaka
    """

    facetcount = self.facets.shape[0]
    # ``np.object`` was removed in NumPy 1.24; the builtin ``object`` is the
    # equivalent dtype and works on every NumPy version.
    self.objsamplepnts_refcls = np.ndarray(shape=(facetcount,), dtype=object)
    self.objsamplenrmls_refcls = np.ndarray(shape=(facetcount,), dtype=object)
    for i in range(facetcount):
        centerpnts = []
        centernrmls = []
        X = self.objsamplepnts_ref[i]
        nX = X.shape[0]
        if nX > 0:
            # Only the radius query of the classifier is needed, so the
            # sample indices serve as placeholder labels.
            neigh = RadiusNeighborsClassifier(radius=1.0)
            neigh.fit(X, np.arange(nX))
            neigharrays = neigh.radius_neighbors(X, radius=reduceRadius, return_distance=False)
            delset = set()
            for j in range(nX):
                if j not in delset:
                    centerpnts.append(np.array(X[j]))
                    centernrmls.append(np.array(self.objsamplenrmls_ref[i][j]))
                    delset.update(neigharrays[j].tolist())
        if centerpnts:
            self.objsamplepnts_refcls[i] = np.vstack(centerpnts)
            self.objsamplenrmls_refcls[i] = np.vstack(centernrmls)
        else:
            self.objsamplepnts_refcls[i] = np.empty(shape=(0, 0))
            self.objsamplenrmls_refcls[i] = np.empty(shape=(0, 0))
def nncut_proc(distance, dt, dr, type):
    """Cut rows of ``dt``/``dr`` by their distance from the origin.

    The rows of ``dt`` lying within ``distance`` of the all-zero point are
    located with a radius-neighbours query, then:

    * ``type == 'inner'``: those rows are dropped from ``dt`` and ``dr``;
    * ``type == 'outer'``: only those rows are kept in ``dt`` and ``dr``;
    * any other value: both frames are returned unchanged.

    :param distance: radius of the query around the origin
    :param dt: feature frame (pandas object exposing ``index``/``drop``)
    :param dr: response frame with the same number of rows as ``dt``
    :param type: ``'inner'`` or ``'outer'``
    :return: ``[dt, dr]`` after the cut; the inputs themselves when ``dt``
        has no rows
    """
    row_count = dt.shape[0]
    if row_count == 0:
        return [dt, dr]

    # Labels do not matter; the classifier is only used for its radius query.
    placeholder_labels = np.zeros_like(dr).reshape(row_count, )
    model = RadiusNeighborsClassifier().fit(dt, placeholder_labels)

    width = dt.shape[1]
    origin = np.zeros(width).reshape(1, width)
    found = model.radius_neighbors(origin, distance, return_distance=False)
    positions = np.asarray(found[0])

    if type == 'inner':
        dt = dt.drop(dt.index[positions])
        dr = dr.drop(dr.index[positions])
    elif type == 'outer':
        dt = dt[dt.index.isin(dt.index[positions])]
        dr = dr[dr.index.isin(dr.index[positions])]
    return [dt, dr]
def Classifier(train_size, new_classes, optimal_test, epsilon_choice,
               X_train_temp, X_test, Y_train_temp, alg):
    """Predict one test point from the training points within a radius.

    A distance-weighted radius-neighbours classifier is fitted on the
    training data and queried for the neighbours of
    ``X_test[optimal_test]`` within ``epsilon_choice``:

    * no neighbours: the label is ``new_classes``;
    * all neighbours share one label: that label;
    * otherwise, for ``alg == "srnc"`` (or an unrecognised ``alg``) the
      radius classifier itself predicts; for a recognised ``alg`` a local
      model of that kind is trained on the neighbours only and predicts.

    :param train_size: not used by this function
    :param new_classes: label returned when the radius is empty
    :param optimal_test: row index into ``X_test`` of the point to predict
    :param epsilon_choice: neighbourhood radius
    :param X_train_temp: training samples (indexed with an integer list)
    :param X_test: test samples
    :param Y_train_temp: training labels (indexed with an integer list)
    :param alg: one of "srnc", "svm", "LinearSVC", "sgd", "dt", "rf",
        "gb", "lr", "mlp"
    :return: ``[label, predict_set]`` where ``predict_set`` is the list of
        training indices found within the radius
    """
    query = X_test[optimal_test].reshape(1, -1)
    radius_model = RadiusNeighborsClassifier(
        radius=epsilon_choice, weights='distance').fit(X_train_temp, Y_train_temp)
    predict_set = list(radius_model.radius_neighbors(query)[1][0])

    if not predict_set:
        return [new_classes, predict_set]

    local_labels = Y_train_temp[predict_set]
    smallest = min(local_labels)
    if smallest == max(local_labels):
        return [smallest, predict_set]

    # Constructors are wrapped in lambdas so that an estimator name is only
    # resolved when that algorithm is actually requested.
    local_models = {
        "svm": lambda: svm.SVC(),
        "LinearSVC": lambda: LinearSVC(max_iter=500000),
        "sgd": lambda: linear_model.SGDClassifier(),
        "dt": lambda: DecisionTreeClassifier(),
        "rf": lambda: RandomForestClassifier(n_estimators=10),
        "gb": lambda: GradientBoostingClassifier(n_estimators=10),
        "lr": lambda: LogisticRegression(max_iter=1000),
        "mlp": lambda: MLPClassifier(),
    }
    model = radius_model
    if alg in local_models:
        model = local_models[alg]().fit(X_train_temp[predict_set], local_labels)
    return [model.predict(query)[0], predict_set]
def _transductive_classifier(self, X_train, y_train, test_instance):
    """Predict a single instance from its neighbours within ``self.epsilon``.

    * No training point within the radius: ``self.new_classes``.
    * All neighbours carry the same label: that label.
    * Otherwise a local model is trained via ``self._fit`` on the
      neighbours; if its highest predicted probability is below
      ``self.threshold_rejection`` the result is ``self.new_classes``,
      else the local model's prediction.

    :param X_train: training samples (indexed with an integer list)
    :param y_train: training labels (indexed with an integer list)
    :param test_instance: the single sample to classify
    :return: the predicted label
    """
    query = test_instance.reshape(1, -1)
    radius_model = RadiusNeighborsClassifier(
        radius=self.epsilon, weights='distance').fit(X_train, y_train)
    neighbour_ids = list(radius_model.radius_neighbors(query)[1][0])

    if not neighbour_ids:
        return self.new_classes

    local_X = X_train[neighbour_ids]
    local_y = y_train[neighbour_ids]
    if np.min(local_y) == np.max(local_y):
        # Unanimous neighbourhood, no model needed.
        return local_y[0]

    local_model = self._fit(local_X, local_y)
    best_proba = np.max(local_model.predict_proba(query))
    if best_proba < self.threshold_rejection:
        return self.new_classes
    return local_model.predict(query)[0]
def nnradiussmooth(self, columns=None, rescolumn=None, distance=0.2, cycles=1):
    """Smooth the in-sample responses towards the mean of their neighbours.

    For every row of ``self.insample_data`` the rows within ``distance`` of
    it are looked up, and the first column of ``self.insample_res`` for
    that row is replaced by ``0.8 * own value + 0.2 * neighbourhood mean``.
    The update is done in place, row by row, so later rows already see the
    smoothed values of earlier rows. The whole pass is repeated ``cycles``
    times.

    :param columns: accepted for interface compatibility; the neighbourhood
        search runs on all columns of ``self.insample_data``.
        NOTE(review): the parameter looks like it was meant to restrict the
        search to a column subset; confirm before wiring it in.
    :param rescolumn: not used
    :param distance: neighbourhood radius
    :param cycles: number of smoothing passes
    :return: None; ``self.insample_res`` is updated
    """
    dt = self.insample_data
    res = self.insample_res

    # Labels are irrelevant; the classifier is only used for its radius query.
    placeholder_labels = np.zeros_like(res).reshape(res.shape[0], )
    nbrs = RadiusNeighborsClassifier().fit(dt, placeholder_labels)
    nb = nbrs.radius_neighbors(dt, distance, return_distance=False)

    for _ in range(cycles):
        # ``nb[row]`` holds the neighbours of ``row``. radius_neighbors does
        # not sort its results, so the first returned index is not
        # guaranteed to be the query row itself; use the row position.
        for row, neighbours in enumerate(nb):
            if len(neighbours) == 0:
                continue
            # Neighbour indices are row positions, hence ``.iloc`` (the old
            # ``.ix`` indexer was removed in pandas 1.0).
            mn = res.iloc[neighbours, 0].mean()
            res.iloc[row, 0] = res.iloc[row, 0] * 0.8 + mn * 0.2
    self.insample_res = res
# Report for the radius classifier: quality on the test split, then on the
# separately supplied sample(s) new_X / new_y, then the neighbours of new_X.
print("Accuracy radius classifier")
print(confusion_matrix(y_test, y_pred_radius))
print(classification_report(y_test, y_pred_radius))

y_pred_radius_for_one = classifier_radius.predict(new_X)
print("radius prediction for one")
print(y_pred_radius_for_one)
print("Accuracy radius classifier for one")
print(confusion_matrix(new_y, y_pred_radius_for_one))
print(classification_report(new_y, y_pred_radius_for_one))

# sort_results=True orders each neighbour list by increasing distance.
radius_neighbors = classifier_radius.radius_neighbors(X=new_X,
                                                      return_distance=True,
                                                      sort_results=True)
print("radius neighbors")
print("The closest neighbors are ([distance, row_index])")
print(radius_neighbors)

# radius_neighbors[1][0] holds the row indices of the neighbours of the first
# query point. Fewer than nr_of_neighbors points may lie within the radius,
# so slice the actual result instead of indexing a fixed count.
for neighbor_row in radius_neighbors[1][0][:nr_of_neighbors]:
    print(data_df.iloc[neighbor_row, :])

# graph = classifier.kneighbors_graph(
#     X=new_X, n_neighbors=nr_of_neighbors, mode='distance')
# How to plot the graph?
#plt.figure(figsize=(12, 6))
#plt.plot(graph.toarray(), new_X, color='red', linestyle='dashed', marker='o',)
class RadiusNeighborsModel(Classifier):
    """Classifier implementing a vote among neighbors within a given radius

    The radius Classifier predicting the labels by counting occurrences among
    the neighbors within a given radius r from a query example.

    In cases where the data is not uniformly sampled, radius-based neighbors
    classifier can be a better choice compared to k-nearest neighbors
    classifier. Points in sparser neighborhoods use fewer nearest neighbors
    for the classification.

    For high-dimensional parameter spaces, this method becomes less effective
    due to the so-called "curse of dimensionality".

    The choice of the radius is highly data-dependent, similarly to k in the
    k-nearest neighbors classifier.
    """

    def __init__(self, radius=1.0, weights='uniform', p=2, metric='minkowski',
                 ranking_size=30):
        """
        :param radius: Range of parameter space to use by default for query
            example
        :param weights: The weight function used in prediction. Possible
            values:
            - 'uniform' : uniform weights. All points in each neighborhood
              are weighted equally.
        :param p: Power parameter for the Minkowski metric
        :param metric: The distance metric to use for the tree. The default
            metric is Minkowski, and with p=2 is equivalent to the standard
            Euclidean metric. Choices are:
            - 'euclidean' for standard Euclidean distance
            - 'manhattan': for the Manhattan distance
            - 'haversine' for distances between (latitude,longitude) points
              only
            - 'cosine': for cosine similarity
            - 'minkowski': the Minkowski distance (euclidean if p=2)
        :param ranking_size: Maximum number of labels returned per example;
            probability rows are zero-padded to this length.
        :raises ValueError: if ``weights`` is anything other than 'uniform'

        Outlier samples (samples with no neighbor within the radius) are
        always predicted as the most common label of the training set; see
        ``fit``.
        """
        if weights != 'uniform':
            raise ValueError("Only 'uniform' for the weights parameter is supported")
        self.radius = radius
        self.weights = weights
        self.p = p
        self.metric = metric
        self.ranking_size = ranking_size

        # Scikit-learn Radius neighbors classifier
        self.clf = RadiusNeighborsClassifier(radius=radius,
                                             weights=weights,
                                             p=p,
                                             metric=metric,
                                             n_jobs=-1)

    def fit(self, X, y):
        """Fit the model and record the fallback used for outlier samples.

        :param X: training examples
        :param y: training labels
        """
        # NOTE(review): ``self.clf`` is not fitted in this method; presumably
        # the base class ``fit`` takes care of it (and of setting ``X_`` /
        # ``y_``) - confirm against ``Classifier.fit``.
        super().fit(X, y)

        # The way outlier samples (samples with no neighbors on given radius)
        # are predicted is the following: predict only one label, the most
        # common one in the training set
        y_unique, counts = np.unique(y, return_counts=True)
        self.outlier_label_ = y_unique[np.argmax(counts)]
        self.outlier_proba_ = np.max(counts) / len(y)

    @staticmethod
    def _stack_rows(rows):
        """Stack per-example rows into one array, tolerating ragged lengths."""
        if len({len(row) for row in rows}) <= 1:
            return np.array(rows)
        # Rows of different lengths: np.array() raises on recent NumPy, so
        # build the 1-D object array of rows explicitly.
        stacked = np.empty(len(rows), dtype=object)
        for position, row in enumerate(rows):
            stacked[position] = row
        return stacked

    def predict(self, X, return_proba=False):
        """Rank the labels found among the neighbors of each example.

        :param X: query examples
        :param return_proba: when True, also return the estimated
            probabilities
        :return: array with, per example, the labels of its neighbors
            ordered by decreasing frequency (at most ``ranking_size`` of
            them). Rows may differ in length, in which case a 1-D object
            array of rows is returned. With ``return_proba`` a tuple
            ``(labels, probabilities)``, each probability row padded with
            zeros to ``ranking_size``.
        """
        # check is fit had been called
        check_is_fitted(self, ['X_', 'y_'])

        # input validation
        X = check_array(X)

        # Neighbor indexes for every test example. The result points are not
        # necessarily sorted by distance to their query point; only the
        # label counts matter here, so distances are not requested.
        neigh_indexes = self.clf.radius_neighbors(X, return_distance=False)

        y_predicted = []
        y_predicted_probas = []
        for indexes in neigh_indexes:
            if len(indexes) == 0:
                # Outlier: no training example within the radius. An empty
                # index array does not raise on lookup, so test explicitly.
                y_predicted.append(np.array([self.outlier_label_]))
                y_predicted_probas.append(
                    np.array([self.outlier_proba_]
                             + [0.] * (self.ranking_size - 1)))
                continue

            y_neigh = self.y_[indexes]
            y_unique, counts = np.unique(y_neigh, return_counts=True)

            # probability estimate: share of the neighbors carrying each label
            probas = counts / len(y_neigh)

            # positions (into y_unique / probas) of the most frequent labels,
            # in decreasing order of probability
            top_predictions = np.flip(
                np.argsort(probas)[-self.ranking_size:], axis=0)

            # These positions refer to y_unique, not to the raw neighbor list.
            y_pred = y_unique[top_predictions]
            y_pred_probas = probas[top_predictions]

            if len(y_unique) < self.ranking_size:
                rank_probas = np.zeros(self.ranking_size)
                rank_probas[:len(y_unique)] = y_pred_probas
                y_pred_probas = rank_probas

            y_predicted.append(y_pred)
            y_predicted_probas.append(y_pred_probas)

        if return_proba:
            return (self._stack_rows(y_predicted),
                    self._stack_rows(y_predicted_probas))
        return self._stack_rows(y_predicted)
def SequentialRadiusNeighborsClassifier(epsilon, X_train, X_test, Y_train, add, alg):
    """Label the test points one at a time, closest-to-the-pool first.

    On every round the still-unlabelled test point whose nearest neighbour
    in the current labelled pool is closest gets picked, and the pool points
    within ``epsilon`` of it are collected:

    * none: the point starts a new class (``max(known labels) + 1``) and is
      always added to the pool;
    * all share one label: that label;
    * otherwise, for ``alg == "srnc"`` (or an unrecognised ``alg``) the
      distance-weighted radius classifier predicts; for a recognised ``alg``
      a local model of that kind is trained on those neighbours and predicts.

    In the last two cases the point is added to the pool only when
    ``add == 1``.

    :param epsilon: neighbourhood radius
    :param X_train: initial labelled samples (copied, never modified)
    :param X_test: samples to label; indexed with integer lists, so an
        ndarray is expected
    :param Y_train: labels for ``X_train`` (copied, never modified)
    :param add: ``1`` to grow the pool with every labelled test point
    :param alg: one of "srnc", "svm", "LinearSVC", "dt", "rf", "gb", "lr",
        "mlp"
    :return: list of predicted labels, aligned with ``X_test``
    """
    X_train_temp = np.copy(X_train)
    Y_train_temp = np.copy(Y_train)
    test_size = len(X_test)
    Y_predict = [-1] * test_size
    Y_current = list(set(Y_train))
    test_index = list(range(test_size))

    # Constructors are wrapped in lambdas so that an estimator name is only
    # resolved when that algorithm is actually requested.
    local_models = {
        "svm": lambda: svm.SVC(),
        "LinearSVC": lambda: LinearSVC(max_iter=10000),
        "dt": lambda: DecisionTreeClassifier(),
        "rf": lambda: RandomForestClassifier(n_estimators=10),
        "gb": lambda: GradientBoostingClassifier(n_estimators=10),
        "lr": lambda: LogisticRegression(max_iter=10000),
        "mlp": lambda: MLPClassifier(),
    }

    for _ in range(test_size):
        # Pick the pending test point closest to the current pool.
        Knn_temp = NearestNeighbors(n_neighbors=1)
        Knn_temp.fit(X_train_temp)
        min_distances = Knn_temp.kneighbors(X_test[test_index])[0]
        min_distances = [np.mean(x) for x in min_distances]
        optimal_test = test_index[min_distances.index(min(min_distances))]
        query = X_test[optimal_test].reshape(1, -1)

        clf = RadiusNeighborsClassifier(radius=epsilon,
                                        weights='distance').fit(
                                            X_train_temp, Y_train_temp)
        predict_set = list(clf.radius_neighbors(query)[1][0])

        if predict_set:
            # predict_set indexes the pool the classifier was fitted on, so
            # the neighbourhood must be read from X_train_temp/Y_train_temp
            # (the previous code referenced undefined names X and Y).
            local_labels = Y_train_temp[predict_set]
            if min(local_labels) == max(local_labels):
                y_predict = min(local_labels)
            else:
                if alg in local_models:
                    clf = local_models[alg]().fit(X_train_temp[predict_set],
                                                  local_labels)
                y_predict = clf.predict(query)[0]
            if add == 1:
                X_train_temp = np.append(X_train_temp,
                                         [X_test[optimal_test]], axis=0)
                Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)
        else:
            # Nothing within epsilon: open a new class for this point.
            y_predict = max(Y_current) + 1
            Y_current.append(y_predict)
            X_train_temp = np.append(X_train_temp,
                                     [X_test[optimal_test]], axis=0)
            Y_train_temp = np.append(Y_train_temp, [y_predict], axis=0)

        Y_predict[optimal_test] = y_predict
        test_index.remove(optimal_test)

    return Y_predict
random_state=0)
## Build the model (named knn_model, but it is a RadiusNeighborsClassifier)
knn_model = RadiusNeighborsClassifier(radius=2)
## Train the model: fit() builds the neighbors model for this dataset.
## fit() takes the training data as a matrix and the labels as an array
## (the original note says "test data" for the second argument).
## Build the model from the training set; ravel() flattens the
## multi-dimensional labels into a 1-D array
knn_model.fit(X_train, y_train.values.ravel())
## Step 4: radius_neighbors and radius_neighbors_graph in practice
## a. radius_neighbors: find the one or more neighbors within the given
##    radius; it returns the index and distance of each such point in the
##    dataset
## b. radius_neighbors_graph: compute the weighted graph between the points
##    in X and their neighbors within the given radius
## c. Steps: first pick one or more query points, then set a radius, then
##    inspect radius_neighbors and radius_neighbors_graph
## Find the one or more neighbors within the given radius; it returns the
## index and distance of each such point in the dataset
## Specify one or more query points; an arbitrary one is used here. It must
## first be converted from a list to an array and then .reshape(1, -1)
## before it can be used
X = [5.8, 2.8, 3.8, 6]
X = np.array(X).reshape(1, -1)
## radius=10 here overrides the radius=2 the model was constructed with
RN = knn_model.radius_neighbors(X, radius=10)
print(RN)
## First element of the result, for the first (only) query point
## (presumably the distances, given the default return order - confirm)
print(np.asarray(RN[0][0]))
# print(np.asarray(RN[1][2]))
## Compute the weighted graph between the points in X and their neighbors
## within the given radius
## radius neighbors graph
RNG = knn_model.radius_neighbors_graph(X, radius=10)
print(RNG)
print(RNG.toarray())
# ## Use X from the test data to predict y
# print(knn_model.predict(X_test))
# ## Inspect the actual y
# print(y_test.values.ravel())
# ## Predicted probability of each y for the test data X
# print(knn_model.predict_proba(X_test))