def test_radius_neighbors_boundary_handling():
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Build an exact nearest neighbors model as reference model to ensure
    # consistency between exact and approximate methods
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # Build a LSHForest model with hyperparameter values that always
    # guarantee exact results on this toy dataset.
    lsfh = ignore_warnings(LSHForest, category=DeprecationWarning)(
        min_hash_match=0, n_candidates=n_points, random_state=42).fit(X)

    # define a query aligned with the first axis
    query = [[1., 0.]]

    # Compute the exact cosine distances of the query to the four points of
    # the dataset
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # The first point is almost aligned with the query (very small angle),
    # the cosine distance should therefore be almost null:
    assert_almost_equal(dists[0], 0, decimal=5)

    # The second point forms an angle of 45 degrees to the query vector
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))

    # The third point is orthogonal to the query vector, hence at a distance
    # of exactly one:
    assert_almost_equal(dists[2], 1)

    # The last point is almost collinear but with opposite sign to the query,
    # therefore it has a cosine 'distance' very close to the maximum possible
    # value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    # If we query with a radius of one, all the samples except the last
    # sample should be included in the results. This means that the third
    # sample is lying on the boundary of the radius query:
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])

    # If we perform the same query with a slightly lower radius, the third
    # point of the dataset that lay on the boundary of the previous query
    # is now rejected:
    eps = np.finfo(np.float64).eps
    exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
    approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)

    assert_array_equal(np.sort(exact_idx[0]), [0, 1])
    assert_array_equal(np.sort(approx_idx[0]), [0, 1])
    assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
    assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
from collections import defaultdict

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeRegressor


class NNLR:
    def __init__(self, k=5, rad=2, mode='k', feat_names=None):
        self.mode = mode
        self.k = k
        self.NN = NearestNeighbors(n_neighbors=k, radius=rad)

    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        self.NN.fit(X)
        self.active = defaultdict(int)

    def nn_lin(self, testX, neighbors):
        # Fit a local model on the neighbors of the query point and predict.
        l = DecisionTreeRegressor()
        l.fit(self.X[neighbors], self.Y[neighbors])
        # for idx in np.where(l.coef_)[0]:
        #     self.active[idx] += 1
        return l.predict([testX])[0]

    def predict(self, X):
        if self.mode == 'k':
            neighbors = self.NN.kneighbors(X)[1]
        elif self.mode == 'rad':
            neighbors = self.NN.radius_neighbors(X)[1]
        return np.array([self.nn_lin(Xtst, nbr)
                         for (Xtst, nbr) in zip(X, neighbors)])
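# Hypothetical usage sketch for NNLR above (toy data, values made up):
# a local regressor is fitted over the k nearest training points of each
# query row before predicting.
rng = np.random.RandomState(0)
X_toy = rng.rand(100, 3)
Y_toy = 2.0 * X_toy[:, 0] + 0.1 * rng.randn(100)
model = NNLR(k=5)
model.fit(X_toy, Y_toy)
print(model.predict(X_toy[:3]))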
def _wpca_analysis(L, C, intensities):
    """
    Determine the eccentricity of each cluster using weighted PCA
    (see Jolliffe 2002, 14.2.1). The smallest normalized explained variance
    is small for flat or filiform objects.

    - L is a numpy matrix (one point on each row)
    - intensities are the gray levels of each point

    No cluster assignment is used here: a ball of radius 10 around each
    center is used to find the cloud of points.
    """
    np.set_printoptions(threshold=50000)
    n_points, n_features = L.shape
    tee.log('WPCA - Fitting NearestNeighbors on', n_points, 'points')
    nbrs = NearestNeighbors(radius=10.0).fit(L)
    for i, c in enumerate(C):
        array_c = np.array([c.x, c.y, c.z])
        i_nbrs = nbrs.radius_neighbors([array_c], 10.0,
                                       return_distance=False)[0]
        points_within = L[i_nbrs]
        if len(points_within) < 64:
            # the set is too small, there is no point in running PCA
            c.EVR = [0.499, 0.499, 0.002]
            c.last_variance = c.EVR[2]
        else:
            w = np.sqrt(intensities[i_nbrs] / 255.0)
            wX = np.dot(np.diag(w), points_within)
            pca = sklearn.decomposition.PCA(n_components=3)
            X_r = pca.fit(wX).transform(wX)
            c.EVR = pca.explained_variance_ratio_
            c.last_variance = c.EVR[2]
            print('WPCA done on', i, '/', len(C), 'name=', c.name,
                  'EVR=', c.EVR)
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to the mean distance from the query point to the other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths,
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order.
    # Therefore we need to sort the results ourselves before performing any
    # comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
def mean_shift(X, bandwidth, n_seeds, kernel_function='gaussian',
               max_iterations=100, proximity_thresh=5):
    '''
    ---Parameters---
    X : data in form (samples, dims)
    bandwidth : radius for the nearest neighbor queries
    n_seeds : number of seed points sampled from X
    kernel_function : can be "gaussian" or "flat" or your own kernel
    proximity_thresh : minimum distance (in pixels) a new cluster must be
        away from previous ones

    ---Returns---
    cluster_centers : center of each cluster
    cluster_counts : how many pixels are within the neighborhood of each
        cluster
    '''
    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    from sklearn.utils import extmath

    if kernel_function == 'gaussian':
        kernel_update_function = gaussian_kernel
    elif kernel_function == 'flat':
        kernel_update_function = flat_kernel
    else:
        kernel_update_function = kernel_function

    n_points, n_features = X.shape
    stop_thresh = 1e-2 * bandwidth  # when the mean has converged
    cluster_centers = []
    cluster_counts = []
    # ball_tree = BallTree(X)  # to efficiently look up nearby points
    neighbors = NearestNeighbors(radius=bandwidth).fit(X)
    seeds = X[np.random.uniform(0, X.shape[0], n_seeds).astype(int)]

    # For each seed, climb the gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            points_within = X[neighbors.radius_neighbors(
                [weighted_mean], bandwidth, return_distance=False)[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within,
                                                   bandwidth)
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                # Only add the cluster if it is different enough from the
                # other centers
                if len(cluster_centers) > 0:
                    diff_from_prev = [
                        np.linalg.norm(weighted_mean - cluster_centers[i], 2)
                        for i in range(len(cluster_centers))]
                    if np.min(diff_from_prev) > proximity_thresh:
                        cluster_centers.append(weighted_mean)
                        cluster_counts.append(points_within.shape[0])
                else:
                    cluster_centers.append(weighted_mean)
                    cluster_counts.append(points_within.shape[0])
                break
            completed_iterations += 1

    return cluster_centers, cluster_counts
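# Hypothetical usage sketch for the mean_shift above. The original module
# defines its own kernels; the flat_kernel below is an assumed stand-in,
# and the call also assumes the old sklearn.utils.extmath.norm used above
# is still importable.
import numpy as np


def flat_kernel(old_mean, points_within, bandwidth):
    # uniform kernel: plain mean of the in-radius points
    return np.mean(points_within, axis=0)


X_demo = np.random.RandomState(0).rand(500, 2)
centers, counts = mean_shift(X_demo, bandwidth=0.2, n_seeds=20,
                             kernel_function='flat', proximity_thresh=0.1)
print(len(centers), counts)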
def test_radius_neighbors():
    """Checks whether returned distances are less than `radius`

    At least one point should be returned when the `radius` is set to the
    mean distance from the query point to the other points in the database.
    Moreover, this test compares the radius neighbors of LSHForest with
    those of `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors are less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
def compute(self):
    nn = NearestNeighbors(radius=self.eps, algorithm='auto',
                          metric=self.metric).fit(self.x)
    self.distances, self.indices = nn.radius_neighbors(self.x, self.eps)
    print(self.distances.shape, self.indices.shape)
    for i in range(self.n):
        if not self.processed[i]:
            self.expand_cluster_order(i)
    assert self.ordered_file_index == self.n
    # print(self.ordered_file)
    self.draw_reachability_plot()
    return self.ordered_file
def build(self, tweets, minimalTermPerTweet=5,
          remove_noise_with_poisson_Law=False):
    """
    Return an upper sparse triangular matrix of similarities (j > i)
    """
    timeThreshold = float(self.timeThreshold)
    distanceThreshold = float(self.distanceThreshold)
    useOnlyHashtags = self.useOnlyHashtags
    numberOfTweets = len(tweets)
    M = dok_matrix((numberOfTweets, numberOfTweets), dtype=np.float64)

    print("  Calculating TF-IDF vectors ...")
    TFIDFVectors, TweetPerTermMap = getTweetsTFIDFVectorAndNorm(
        tweets, minimalTermPerTweet=minimalTermPerTweet,
        remove_noise_with_poisson_Law=remove_noise_with_poisson_Law,
        useOnlyHashtags=useOnlyHashtags)

    print("  Constructing similarity matrix ...")
    distanceThresholdInDegree = distanceThreshold / DEG_LATITUDE_IN_METER
    spatialIndex = NearestNeighbors(radius=distanceThresholdInDegree,
                                    algorithm='auto')
    spatialIndex.fit(np.array([(tweet.position.latitude,
                                tweet.position.longitude)
                               for tweet in tweets]))

    SHOW_RATE = 100
    for i in range(numberOfTweets):
        if i % SHOW_RATE == 0:
            print("\t", i, ";")
        tweetI, TFIDFVectorI = tweets[i], TFIDFVectors[i]
        neighboors = set()

        # Collect neighbors by terms (tweets sharing at least one term)
        TFIDFVectorIKeySet = set(TFIDFVectorI)
        for term in TFIDFVectorIKeySet:
            neighboors |= TweetPerTermMap[term]

        # Collect spatial neighbors (tweets within self.distanceThreshold)
        position = np.array([tweetI.position.latitude,
                             tweetI.position.longitude]).reshape(-1, 2)
        neighboors &= set(spatialIndex.radius_neighbors(position)[1][0])

        for j in neighboors:
            tweetJ = tweets[j]
            # Ignore tweets that do not come after tweetI, and tweets that
            # are outside the temporal neighborhood of tweetI.
            if j <= i or tweetJ.delay(tweetI) > self.timeThreshold:
                continue
            TFIDFVectorJ = TFIDFVectors[j]
            TFIDFVectorJKeySet = set(TFIDFVectorJ)
            keysIntersection = TFIDFVectorIKeySet & TFIDFVectorJKeySet
            similarity = 0
            for term in keysIntersection:
                similarity += TFIDFVectorI[term] * TFIDFVectorJ[term]
            M[i, j] = similarity

    return coo_matrix(M)
def getSim_dense(day, centroids, dataset, thred_radius_dist, vdw, rel_dw):
    print("## Begin calculating centroid dataset sim.",
          len(centroids), dataset.shape)
    dataset_vdw = dataset[range(vdw[0], vdw[1]), :]
    if 1:
        # radius query against the full dataset
        nnModel = NearestNeighbors(radius=thred_radius_dist,
                                   algorithm='brute', metric='minkowski',
                                   p=2, n_jobs=1)
        num_centroids = len(centroids)
        # allData = np.append(centroids, dataset, axis=0)
        nnModel.fit(dataset)
        ngIdxArray = nnModel.radius_neighbors(centroids, thred_radius_dist,
                                              return_distance=False)
    if 0:
        # brute-force fallback kept for reference: compute distances to the
        # dataset_vdw window directly
        ngIdxArray = []
        for vecId, vec in enumerate(centroids):
            distArr = euclidean_distances(np.array([vec]), dataset_vdw)
            nn_keys = [i + vdw[0] for i, eu in enumerate(distArr[0])
                       if eu <= thred_radius_dist]
            ngIdxArray.append(np.asarray(nn_keys, dtype=np.int32))
        ngIdxArray = np.asarray(ngIdxArray)
    print("## nn cal completed", time.asctime())
    return ngIdxArray
def _finalize_masses(X, C, intensities):
    """
    Regardless of the parameters of the algorithm, place a ball of radius 10
    around each center and compute the mass in the ball.

    Rationale: thresholding for discriminating between centers and
    non-centers should not depend on the parameters used to seek the
    centers. This mass will later be used for the recall-precision curve.
    Hopefully, wild variations of performance across different substacks
    will be reduced this way.
    """
    n_points, n_features = X.shape
    tee.log('Finalizing masses - Fitting NearestNeighbors on',
            n_points, 'points')
    nbrs = NearestNeighbors(radius=10.0).fit(X)
    for c in C:
        array_c = np.array([c.x, c.y, c.z])
        i_nbrs = nbrs.radius_neighbors([array_c], 10.0,
                                       return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break
        c.mass = sum(intensities[i_nbrs])
def _mi_dc(x, y, k):
    """
    Calculates the mutual information between a continuous vector x and a
    discrete class vector y.

    This implementation can calculate the MI between the joint distribution
    of one or more continuous variables (X[:, 1:3]) with a discrete variable
    (y).

    Thanks to Adam Pocock, the author of the FEAST package, for the idea.

    Brian C. Ross, 2014, PLOS ONE
    Mutual Information between Discrete and Continuous Data Sets
    """
    y = y.flatten()
    n = x.shape[0]
    classes = np.unique(y)
    knn = NearestNeighbors(n_neighbors=k)
    # distance to the kth in-class neighbour
    d2k = np.empty(n)
    # number of points within each point's class
    Nx = []
    for yi in y:
        Nx.append(np.sum(y == yi))

    # find the distance of the kth in-class point
    for c in classes:
        mask = np.where(y == c)[0]
        knn.fit(x[mask, :])
        d2k[mask] = knn.kneighbors()[0][:, -1]

    # find the number of points within the distance of the kth in-class
    # point
    knn.fit(x)
    m = knn.radius_neighbors(radius=d2k, return_distance=False)
    m = [i.shape[0] for i in m]

    # calculate MI based on Equation 2 in Ross 2014
    MI = psi(n) - np.mean(psi(Nx)) + psi(k) - np.mean(psi(m))
    return MI
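# Hypothetical usage sketch for _mi_dc (assumes numpy, scipy.special.psi
# and NearestNeighbors are imported as in the snippet above, and a
# scikit-learn version whose tree-based radius_neighbors accepts the
# per-sample radius array the snippet relies on).
import numpy as np

rng = np.random.RandomState(0)
y_demo = rng.randint(0, 2, 200)
x_demo = y_demo.reshape(-1, 1) + rng.normal(scale=0.5, size=(200, 1))
print(_mi_dc(x_demo, y_demo, k=3))  # clearly above 0 for dependent x, y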
class NNR(object):
    def __init__(self, r=1.0, k=10, def_mean=0, def_sd=float('inf'),
                 off=0.1):
        self.reg = KRadiusNeighborsRegressor(n_neighbors=k, radius=r,
                                             defval=def_mean,
                                             weights=self.comp_weights)
        # Regresses squared error
        self.err_reg = NearestNeighbors(n_neighbors=k, radius=r)
        self.off = off
        self.def_sd = float(def_sd)

    def comp_weights(self, dists):
        return 1.0 / (dists + self.off)

    def fit(self, X, y):
        self.reg.fit(X, y)
        errs = y - self.reg.predict(X)
        self.sse = errs * errs
        self.err_reg.fit(X)

    def predict(self, X, return_std=False):
        if not return_std:
            return self.reg.predict(X)
        else:
            mean_val = self.reg.predict(X)
            sds = []
            dists, inds = self.err_reg.radius_neighbors(
                X, return_distance=True)
            for d, i in zip(dists, inds):
                if len(d) < 2:
                    sds.append(self.def_sd)
                else:
                    errs = self.sse[i]
                    weights = self.comp_weights(d)
                    sd = np.average(errs, axis=0, weights=weights) / len(d)
                    sds.append(sd)
            return mean_val, np.array(sds)
def dbscan(X, eps, min_samples, mode, visualize, metric='minkowski',
           algorithm='auto', leaf_size=30, p=2, random_state=None):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Parameters
    ----------
    X: array [n_samples, n_samples] or [n_samples, n_features]
        Array of distances between samples, or a feature array.
        The array is treated as a feature array unless the metric is
        given as 'precomputed'.
    eps: float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.
    min_samples: int, optional
        The number of samples in a neighborhood for a point to be
        considered as a core point.
    metric: string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter. If metric is "precomputed", X is assumed to be a
        distance matrix and must be square.
    algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module to compute
        pointwise distances and find nearest neighbors. See NearestNeighbors
        module documentation for details.
    leaf_size: int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required to
        store the tree. The optimal value depends on the nature of the
        problem.
    p: float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.
    random_state: numpy.RandomState, optional
        The generator used to initialize the centers. Defaults to
        numpy.random.

    Returns
    -------
    core_samples: array [n_core_samples]
        Indices of core samples.
    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_dbscan.py for an example.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with
    Noise". In: Proceedings of the 2nd International Conference on Knowledge
    Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = np.asarray(X)
    n = X.shape[0]

    # If index order not given, create random order.
    random_state = check_random_state(random_state)
    index_order = np.arange(n)
    random_state.shuffle(index_order)

    # check for known metric powers
    distance_matrix = True
    if metric == 'precomputed':
        D = pairwise_distances(X, metric=metric)
    else:
        distance_matrix = False
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric, p=p)
        neighbors_model.fit(X)

    # Calculate neighborhood for all samples. This leaves the original point
    # in, which needs to be considered later (i.e. point i is in the
    # neighborhood of point i; while true, it is useless information).
    neighborhoods = []
    if distance_matrix:
        neighborhoods = [np.where(x <= eps)[0] for x in D]

    # Initially, all samples are noise.
    labels = -np.ones(n)

    # A list of all core samples found.
    core_samples = []

    # label_num is the label given to the new cluster
    label_num = 0

    # Look at all samples and determine if they are core.
    # If they are then build a new cluster from them.
    for index in index_order:
        # Already classified
        if labels[index] != -1:
            continue

        # get neighbors from neighborhoods or the tree model
        index_neighborhood = []
        if distance_matrix:
            index_neighborhood = neighborhoods[index]
        else:
            index_neighborhood = neighbors_model.radius_neighbors(
                X[index], eps, return_distance=False)[0]

        # Too few samples to be core
        if len(index_neighborhood) < min_samples:
            continue

        core_samples.append(index)
        labels[index] = label_num
        # candidates for new core samples in the cluster.
        candidates = [index]

        while len(candidates) > 0:
            new_candidates = []
            # A candidate is a core point in the current cluster that has
            # not yet been used to expand the current cluster.
            for c in candidates:
                c_neighborhood = []
                if distance_matrix:
                    c_neighborhood = neighborhoods[c]
                else:
                    c_neighborhood = neighbors_model.radius_neighbors(
                        X[c], eps, return_distance=False)[0]
                noise = np.where(labels[c_neighborhood] == -1)[0]
                noise = c_neighborhood[noise]
                labels[noise] = label_num
                for neighbor in noise:
                    n_neighborhood = []
                    if distance_matrix:
                        n_neighborhood = neighborhoods[neighbor]
                    else:
                        n_neighborhood = neighbors_model.radius_neighbors(
                            X[neighbor], eps, return_distance=False)[0]
                    # check if it is a core point as well
                    if len(n_neighborhood) >= min_samples:
                        # is new core point
                        new_candidates.append(neighbor)
                        core_samples.append(neighbor)
            # Update candidates for next round of cluster expansion.
            candidates = new_candidates
        # Current cluster finished.
        # Next core point found will start a new cluster.
        label_num += 1

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    # print("Silhouette Coefficient: %0.3f"
    #       % metrics.silhouette_score(X, labels))

    ##########################################################################
    # Plot result
    # Black removed and is used for noise instead.
    human = np.zeros((1, len(labels)), int)  # declare all points 0
    unique_labels = set(labels)
    surface = []
    # colors = pl.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    # ccnames = ['grey', 'black', 'violet', 'blue', 'cyan', 'rosy',
    #            'orange', 'red', 'green', 'brown', 'yellow', 'gold']
    ccnames = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow',
               'black', 'white', 'grey']
    cc = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w', '0.75']
    [xi, yi, zi] = [X[:, 0], X[:, 1], X[:, 2]]
    [xmin, xmax] = [min(xi), max(xi)]
    [ymin, ymax] = [min(yi), max(yi)]
    [zmin, zmax] = [min(zi), max(zi)]
    [xnodes, ynodes, znodes] = [np.linspace(xmin, xmax, 20, endpoint=True),
                                np.linspace(ymin, ymax, 20, endpoint=True),
                                np.linspace(zmin, zmax, 20, endpoint=True)]

    for k, col in zip(unique_labels, cc):
        if k == -1:
            # Black used for noise.
            col = 'k'
            markersize = 6
        class_members = [index[0] for index in np.argwhere(labels == k)]
        cluster_core_samples = [index for index in core_samples
                                if labels[index] == k]
        for index in class_members:
            x = X[index]
            if index in core_samples and k != -1:
                markersize = 10
            else:
                markersize = 6
            pl.plot(x[1], x[2], 'o', markerfacecolor=col,
                    markeredgecolor='k', markersize=markersize)

    pl.title('Estimated number of clusters: %d' % n_clusters_)
    pl.show()  # plot figure

    # MANUALLY ANNOTATE DATA
    obj = 0
    for obj in range(0, n_clusters_):
        filter = np.where(labels[:] == obj)[0]
        if mode == 0:
            rospy.loginfo('Is %s human (1 for yes, 0 for no): ',
                          ccnames[obj])
            temp = input()
            for i in filter:
                human[0, i] = temp
        surface = my_griddata.griddata(yi[filter], zi[filter], xi[filter],
                                       ynodes, znodes, 'nn')
        # surface = surface - min(min(surface))
        rospy.loginfo('extract surface for cluster %d', obj)
        obj = obj + 1

    pl.close()
    return core_samples, labels, n_clusters_, human, surface
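# Hypothetical usage sketch for the dbscan variant above. mode=1 skips the
# interactive human labelling, but the surface extraction still needs the
# project's my_griddata helper and a ROS environment for rospy, so this is
# illustrative only; the 3-D point cloud is synthetic.
import numpy as np

X_cloud = np.random.RandomState(1).rand(200, 3)
core, labels, n_clusters, human, surface = dbscan(
    X_cloud, eps=0.15, min_samples=5, mode=1, visualize=False)
print(n_clusters)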
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to the mean distance from the query point to the other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1, ))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths,
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries, ))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries, ))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order.
    # Therefore we need to sort the results ourselves before performing any
    # comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
def optimal_solution():
    min_cost_flow = pywrapgraph.SimpleMinCostFlow()
    all_items = generate_all()
    drivers, driver_dict = all_items["drivers"]
    companies, company_dict = all_items["companies"]
    restaurants, restaurant_dict = all_items["restaurants"]
    orders, order_dict = all_items['orders']
    drivers_path = {}
    order_len = len(orders)

    # generate arcs from the source node (0) to the order nodes;
    # sets the cost to 0 and the capacity to 1
    user_increment = 1
    for x in range(order_len):
        min_cost_flow.AddArcWithCapacityAndUnitCost(0, user_increment, 1, 0)
        user_increment += 1

    # generate arcs from the driver nodes to the sink node;
    # sets the cost to 0 and the capacity to 1.
    # The loop also builds the array of driver locations for later.
    driver_locations = []
    sink_index = len(companies) + len(drivers) + 1
    for driver in drivers:
        driver_locations.append(driver.location_in_rad())
        min_cost_flow.AddArcWithCapacityAndUnitCost(user_increment,
                                                    sink_index, 1, 0)
        user_increment += 1

    # goes through every company and generates the shortest path between
    # that company's restaurants. Given the shortest path starting at each
    # restaurant, we use a nearest neighbor search to determine which
    # drivers are in range. After finding all drivers in range, it only
    # adds an edge for the shortest total distance for each driver.
    neigh = NearestNeighbors(metric="haversine", algorithm="ball_tree")
    neigh.fit(driver_locations)
    for order in orders:
        driver_info = [None] * len(drivers)
        order_index = orders.index(order) + 1
        drivers_path[order_index] = {}
        shortest_paths_info = get_shortest_restaurant_paths(
            order.restaurants, order.company)
        for sp_info in shortest_paths_info:
            order_range = order.deadline - sp_info['distance']
            if order_range < 0.0:
                continue
            order_range_rad = deg2rad(order_range * 0.008)
            start_location = sp_info['order'][0].location_in_rad()
            rng = neigh.radius_neighbors([start_location],
                                         radius=order_range_rad)
            indices = np.asarray(rng[1][0])
            distances = np.asarray(rng[0][0])
            for index in range(len(indices)):
                real_index = indices[index]
                driver_index = index + order_len + 1
                distance = int((distances[index] * EARTH_RADIUS
                                + sp_info['distance']) * 1000)
                if (driver_info[real_index] is None
                        or driver_info[real_index][0] > distance):
                    driver_info[real_index] = (distance, driver_index,
                                               sp_info)
        for d_info in driver_info:
            if d_info is None:
                continue
            drivers_path[order_index][d_info[1]] = d_info[2]
            min_cost_flow.AddArcWithCapacityAndUnitCost(
                order_index, d_info[1], 1, d_info[0])

    # get the size of the limiting side, drivers or orders, and set that
    # as the supply
    node_supplies = min(len(drivers), order_len)
    supplies = ([node_supplies] + ([0] * (order_len + len(drivers)))
                + [(-1) * node_supplies])
    for i in range(len(supplies)):
        min_cost_flow.SetNodeSupply(i, supplies[i])

    # calculate the max flow with min cost and save information about
    # matches
    final_matches = [None] * order_len
    successful_matches = 0.0
    if min_cost_flow.SolveMaxFlowWithMinCost() == min_cost_flow.OPTIMAL:
        for arc in range(min_cost_flow.NumArcs()):
            if (min_cost_flow.Tail(arc) != 0
                    and min_cost_flow.Head(arc) != sink_index):
                if min_cost_flow.Flow(arc) > 0:
                    successful_matches += 1.0
                    tail = min_cost_flow.Tail(arc)
                    head = min_cost_flow.Head(arc)
                    orders[tail - 1].set_shortest_path(
                        drivers_path[tail][head])
                    final_matches[min_cost_flow.Tail(arc) - 1] = (
                        head - order_len,
                        min_cost_flow.UnitCost(arc) / 1000.0)

    # print results
    driver_distance = int(min_cost_flow.OptimalCost())
    percent_matched = (0 if order_len == 0
                       else successful_matches / order_len * 100)
    # print_data(final_matches, orders, percent_matched, driver_distance)
    # print(int(successful_matches), 0, int(successful_matches), 0)
    # return (int(successful_matches), 0, int(successful_matches), 0)
    return final_matches
def mean_shift_cosine(X, bandwidth=None, seeds=None, bin_seeding=False,
                      min_bin_freq=1, cluster_all=True, max_iter=300,
                      n_jobs=None):
    """Perform mean shift clustering of data using a flat kernel.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Input data.

    bandwidth : float, optional
        Kernel bandwidth.

        If bandwidth is not given, it is determined using a heuristic based
        on the median of all pairwise distances. This will take quadratic
        time in the number of samples. The sklearn.cluster.estimate_bandwidth
        function can be used to do this more efficiently.

    seeds : array-like, shape=[n_seeds, n_features] or None
        Points used as initial kernel locations. If None and
        bin_seeding=False, each data point is used as a seed. If None and
        bin_seeding=True, see bin_seeding.

    bin_seeding : boolean, default=False
        If true, initial kernel locations are not locations of all points,
        but rather the location of the discretized version of points, where
        points are binned onto a grid whose coarseness corresponds to the
        bandwidth. Setting this option to True will speed up the algorithm
        because fewer seeds will be initialized.
        Ignored if seeds argument is not None.

    min_bin_freq : int, default=1
        To speed up the algorithm, accept only those bins with at least
        min_bin_freq points as seeds.

    cluster_all : boolean, default True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    max_iter : int, default 300
        Maximum number of iterations, per seed point, before the clustering
        operation terminates (for that seed point), if it has not converged
        yet.

    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation. This works by
        computing each of the n_init runs in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.17
           Parallel Execution using *n_jobs*.

    Returns
    -------
    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_mean_shift.py
    <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
    """
    if bandwidth is None:
        bandwidth = estimate_bandwidth(X, n_jobs=n_jobs)
    elif bandwidth <= 0:
        raise ValueError("bandwidth needs to be greater than zero or None,"
                         " got %f" % bandwidth)

    if seeds is None:
        if bin_seeding:
            seeds = get_bin_seeds(X, bandwidth, min_bin_freq)
        else:
            seeds = X
    n_samples, n_features = X.shape
    center_intensity_dict = {}
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=n_jobs,
                            metric='cosine').fit(X)

    # execute iterations on all seeds in parallel
    all_res = Parallel(n_jobs=n_jobs)(
        delayed(_mean_shift_cosine_single_seed)(seed, X, nbrs, max_iter)
        for seed in seeds)
    # copy results in a dictionary
    for i in range(len(seeds)):
        if all_res[i] is not None:
            center_intensity_dict[all_res[i][0]] = all_res[i][1]

    if not center_intensity_dict:
        # nothing near seeds
        raise ValueError("No point was within bandwidth=%f of any seed."
                         " Try a different seeding strategy"
                         " or increase the bandwidth." % bandwidth)

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: (tup[1], tup[0]),
                                 reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=n_jobs,
                            metric='cosine').fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1, n_jobs=n_jobs,
                            metric='cosine').fit(cluster_centers)
    labels = np.zeros(n_samples, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels.fill(-1)
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels
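# Hypothetical usage sketch for mean_shift_cosine above (assumes
# _mean_shift_cosine_single_seed and the module's sklearn/joblib imports
# are in scope). Cosine distance lies in [0, 2], so the bandwidth is
# picked in that range rather than in data units.
import numpy as np

X_cos = np.abs(np.random.RandomState(0).rand(300, 5))
centers, labels = mean_shift_cosine(X_cos, bandwidth=0.3)
print(centers.shape, np.bincount(labels))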
def getPoi(self):
    self.poi = []

    # ---- Step 1 : Get the events near each stop point ----------------------#
    associatedVisits = []
    neighboorsServerAll = NearestNeighbors(radius=self.distanceThres,
                                           algorithm="auto", leaf_size=10)
    neighboorsServerAll.fit(np.array([event.position
                                      for event in self.events]))
    for i in range(len(self.stops)):
        associatedVisits.append(set(
            neighboorsServerAll.radius_neighbors(
                self.stops[i].position)[1][0]))
    # -------------------------------------------------------------------------#

    # ---- Step 2 : Merge stop points into POIs and merge their visits -------#
    aggregatedPOI = np.array([-1] * len(self.stops))
    numberOfPOI = 0
    for i in range(len(self.stops)):
        inIntersection = []
        for j in range(len(self.stops)):
            if associatedVisits[i] & associatedVisits[j]:
                inIntersection.append(j)
        poiId = min(aggregatedPOI[inIntersection])
        if poiId == -1:
            numberOfPOI += 1
            poiId = numberOfPOI
        aggregatedPOI[inIntersection] = poiId

    listPoiandNeighboorsIndex = []
    for poiId in set(aggregatedPOI):
        members = self.stops[aggregatedPOI == poiId]
        indices = np.array(range(len(self.stops)))[aggregatedPOI == poiId]
        neighboors = set()
        for i in indices:
            neighboors |= associatedVisits[i]
        poi = Poi(
            poiId,
            sum([p.longitude() for p in members]) / len(members),
            sum([p.latitude() for p in members]) / len(members),
        )
        listPoiandNeighboorsIndex.append((poi, sorted(neighboors)))
    # -------------------------------------------------------------------------#

    # ---- Step 3 : Create the list of POIs ----------------------------------#
    infrequentVisits = []
    for item in listPoiandNeighboorsIndex:
        poi, neighborsIndex = item
        j = neighborsIndex[0]
        for i in range(1, len(neighborsIndex)):
            if neighborsIndex[i] > neighborsIndex[i - 1] + 1:
                visit = Visit(-1, self.events[j].time,
                              self.events[neighborsIndex[i - 1] + 1].time)
                j = neighborsIndex[i]
                if visit.duration() >= self.stayTimeThres:
                    poi.addVisit(visit)
        k = (neighborsIndex[-1] + 1
             if neighborsIndex[-1] + 1 < len(self.events)
             else neighborsIndex[-1])
        visit = Visit(-1, self.events[j].time, self.events[k].time)
        if visit.duration() >= self.stayTimeThres:
            poi.addVisit(visit)
        if poi.visits:
            if len(poi.visits) >= self.freqThres:
                self.poi.append(poi)
            else:
                infrequentVisits.extend(poi.visits)

    if infrequentVisits:
        self.poi.append(
            Poi("I", float("nan"), float("nan"),
                sorted(infrequentVisits, key=lambda visit: visit.arrival)))
    # -------------------------------------------------------------------------#

    self.finalize(self.poi, mergeVisits=False)
    return self.poi
'''
        test_calc_polarity[i] = 1
    elif test_calc_polarity[i] == 2:
        test_calc_polarity[i] = 0
'''
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings("ignore")

train_calc_polarity = []
neigh = NearestNeighbors(n_neighbors=30, radius=0.5, metric='cosine',
                         algorithm='brute')
neigh.fit(Embedding_reduced_train)
# lshf = LSHForest(random_state=42, n_neighbors=30, radius=0.5)
# lshf.fit(X_train)
for i in range(len(probability)):
    nbrs = neigh.radius_neighbors(Embedding_reduced_train[i],
                                  return_distance=False)
    # nbrs = lshf.kneighbors(Embedding_reduced_train[i],
    #                        return_distance=False)
    # average the class probabilities of the in-radius neighbors
    tempdd = 0
    for j in range(len(nbrs[0])):
        tempdd = tempdd + probability[nbrs[0][j]]
    tempdd = tempdd / len(nbrs[0])
    probability[i] = tempdd
    inde = probability[i].argsort()[-2:][::-1]
    if probability[i][inde[0]] - probability[i][inde[1]] < 0:
        train_calc_polarity.append(0)
    else:
        if inde[0] == 0:
            train_calc_polarity.append(1)
        elif inde[0] == 1:
            train_calc_polarity.append(-1)
def mean_shift(X, intensities=None, bandwidth=None, seeds=None,
               cluster_all=True, max_iterations=300, verbose=False,
               use_scipy=True):
    """Mean shift algorithm

    Implementation taken from scikit-learn with two minor variants:

    - use (by default) scipy KD-trees, which are faster in our case
    - weighted version of mean-shift using `intensities` as weights (i.e.,
      we compute centers of mass rather than means)

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Input data.
    intensities : array-like, shape=[n_samples]
        Voxel intensities, used to weight the mean.
    bandwidth : float
        Kernel bandwidth.
    seeds : array-like, shape=[n_seeds, n_features]
        Points used as initial kernel locations.
    use_scipy : bool
        If true, use cKDTree from scipy.spatial; otherwise use
        NearestNeighbors from sklearn.neighbors.

    Returns
    -------
    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.
    labels : array, shape=[n_samples]
        Cluster labels for each point.
    volumes : array, shape=[n_clusters]
        Volume of each cluster (# of points in the cluster).
    masses : array, shape=[n_clusters]
        Mass of each cluster (sum of intensities of points in the cluster).
    trajectories : list
        MS trajectories for debugging purposes.
    """
    if seeds is None:
        seeds = X
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when the mean has converged
    center_volume_dict = {}
    center_mass_dict = {}
    # tee.log('Fitting NearestNeighbors on', n_points, 'points')
    if use_scipy:
        kdtree = cKDTree(X)
    else:
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

    # For each seed, climb the gradient until convergence or max_iterations
    trajectories = {}  # for each seed, a list of points
    tee.log('Moving kernels for', len(seeds), 'seeds')
    pbar = pb.ProgressBar(
        widgets=['Moving %d seeds: ' % len(seeds), pb.Percentage()],
        maxval=len(seeds)).start()
    for seed_no, my_mean in enumerate(seeds):
        completed_iterations = 0
        seed = my_mean
        trajectories[seed_no] = []
        while True:
            # Find the mean of the points within bandwidth
            if use_scipy:
                i_nbrs = kdtree.query_ball_point(my_mean, r=bandwidth)
            else:
                i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                               return_distance=False)[0]
            points_within = X[i_nbrs]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            my_old_mean = my_mean  # save the old mean
            if intensities is None:
                my_mean = np.mean(points_within, axis=0)
            else:
                my_mean = np.average(points_within, axis=0,
                                     weights=intensities[i_nbrs])
            # If converged or at max_iterations, add the cluster
            if (extmath.norm(my_mean - my_old_mean) < stop_thresh
                    or completed_iterations == max_iterations):
                center_volume_dict[tuple(my_mean)] = len(points_within)
                center_mass_dict[tuple(my_mean)] = sum(intensities[i_nbrs])
                break
            completed_iterations += 1
            trajectories[seed_no].append(my_mean)
        if verbose:
            print('seed', seed, '-->', my_mean,
                  center_volume_dict[tuple(my_mean)],
                  center_mass_dict[tuple(my_mean)], completed_iterations)
        pbar.update(seed_no + 1)
    pbar.finish()

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_mass_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    print('started from', len(seeds), 'seeds, now |unique|=', len(unique))
    # print('|center_mass_dict|=', len(center_mass_dict))
    if len(center_mass_dict) == 0:
        tee.log('No valid seeds. Giving up')
        return None, None, None, None, None
    nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors(
                [center], return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print('|cluster_centers|=', len(cluster_centers))
    volumes = [0] * len(cluster_centers)
    masses = [0] * len(cluster_centers)
    for i, c in enumerate(cluster_centers):
        volumes[i] = center_volume_dict[tuple(c)]
        masses[i] = center_mass_dict[tuple(c)]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels, volumes, masses, trajectories
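# Hypothetical usage sketch for the weighted mean_shift above (assumes the
# module-level imports of the original file: scipy.spatial.cKDTree,
# progressbar as pb, sklearn's extmath, and the tee logging helper; the
# point cloud and intensities are synthetic).
import numpy as np

X_vox = np.random.RandomState(0).rand(400, 3) * 50
intensities = np.random.RandomState(1).randint(1, 256, 400)
centers, labels, volumes, masses, traj = mean_shift(
    X_vox, intensities=intensities, bandwidth=5.0)
print(len(centers), volumes[:3], masses[:3])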
def find_recommendations(self, tweets=[], top=10, quality=.1,
                         min_examples=1):
    working_list = []
    result_list = []
    try:
        config.LOGGER.info('Generating content recommendations for user %s',
                           self.account['profile']['preferredUsername'])
        if self.svd is not None:
            if len(tweets) < top:
                config.LOGGER.debug(
                    "Too few tweets passed for recommendation")
                return []
            # tokenized_tweets = [' '.join(doc['newKeys']) for doc in tweets]
            # tweetText = [tw['text'] for tw in tweets]
            tweetText = [' '.join(tw['keywords']) for tw in tweets]
            Y = self.vectorizer.transform(tweetText)
            svdY = self.svd.transform(Y)
            svdY = self.normalizer.transform(svdY)
            y_transform = self.k_means.transform(svdY)
            # terms = self.vectorizer.get_feature_names()
            selected_updates = []
            y_predict = self.k_means.predict(svdY)
            for i in range(self.cluster_count):
                cluster_distance = []
                for j in range(len(y_predict)):
                    if y_predict[j] == i and sum(svdY[j]) != 0.0:
                        cluster_distance.append(
                            {'index': j, 'cluster': i,
                             'dist': np.sqrt(sum(
                                 [y * y for y in y_transform[j]]))})
                newlist = sorted(cluster_distance,
                                 key=operator.itemgetter('dist'),
                                 reverse=False)
                selected_updates.append(newlist)
            temp = [entry for entry in it.zip_longest(*selected_updates)]
            clean_list = [entry for tup in temp for entry in tup
                          if entry is not None][0:top]
            clean_list_svdY = [svdY[entry['index']] for entry in clean_list]
            config.LOGGER.debug("Found %i possible matches in topic clusters"
                                % len(clean_list_svdY))
            neigh = NearestNeighbors()
            neigh.fit(self.svdX)
            if len(clean_list_svdY) > 0:
                distances, svd_neighbors = neigh.radius_neighbors(
                    X=clean_list_svdY, radius=quality)
            else:
                svd_neighbors = []
            examples = []
            for idx, entry in enumerate(svd_neighbors):
                if len(entry) >= min_examples:
                    config.LOGGER.debug("Suggested tweet has %d examples"
                                        % len(entry))
                    original = tweets[clean_list[idx]['index']]['text']
                    for jdx, neighbor in enumerate(entry):
                        examples.append(
                            {'text': self.training_docs[neighbor]['text'],
                             'dist': distances[idx][jdx]})
                    sorted_examples = sorted(
                        examples, key=operator.itemgetter('dist'),
                        reverse=False)
                    min_examples = [item['text']
                                    for item in sorted_examples][:min_examples]
                    t1 = self.training_docs[self.all_cluster_dist[
                        clean_list[idx]['cluster']][0]['index']]['text']
                    t2 = self.training_docs[self.all_cluster_dist[
                        clean_list[idx]['cluster']][1]['index']]['text']
                    working_list.append(
                        {"dist": sorted_examples[0]['dist'],
                         "text": original,
                         "id": str(tweets[clean_list[idx]['index']]['_id']),
                         "sender": str(
                             tweets[clean_list[idx]['index']]['sender']),
                         'samples_svd': min_examples,
                         'samples_cluster': [t1, t2]})
            result_list = sorted(working_list,
                                 key=operator.itemgetter('dist'),
                                 reverse=False)
        return result_list[:top]
    except Exception as ex:
        config.LOGGER.error("Error %s computing recommendations for"
                            " mission %s", ex, self.missionId)
        return []
def mean_shift(X, bandwidth, max_iter):
    (m, n) = X.shape
    print(m, n)
    graph = tf.Graph()
    with graph.as_default():
        with tf.name_scope("input") as scope:
            data = tf.constant(X, name="data_points")
            b = tf.constant(bandwidth, dtype=tf.float32, name="bandwidth")
            m = tf.constant(max_iter, name="maximum_iteration")
            # n_samples = tf.constant(m, name="no_of_samples")
            # n_features = tf.constant(n, name="no_of_features")
        # with tf.name_scope("seeding") as scope:
        #     seed = tf.placeholder(tf.float32, [5], name="seed")
        with tf.name_scope("mean_shifting") as scope:
            old_mean = tf.placeholder(tf.float32, [n], name="old_mean")
            neighbors = tf.placeholder(tf.float32, [None, n],
                                       name="neighbors")
            new_mean = tf.reduce_mean(neighbors, 0)
            euclid_dist = tf.sqrt(
                tf.reduce_sum(tf.pow(tf.sub(old_mean, new_mean), 2)),
                name="mean_distance")

        center_intensity_dict = {}
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

        sess = tf.Session()
        init = tf.initialize_all_variables()
        sess.run(init)
        writer = tf.train.SummaryWriter(FLAGS.log_dir, sess.graph_def)

        # bin the points onto a grid of coarseness `bandwidth` to get seeds
        bin_sizes = defaultdict(int)
        data_point = tf.placeholder(tf.float32, [n], "data_point")
        binned_point = tf.floordiv(data_point, b)
        for point in X:
            feed = {data_point: point}
            bp = sess.run(binned_point, feed_dict=feed)
            bin_sizes[tuple(bp)] += 1
        bin_seeds = np.array([point for point, freq
                              in six.iteritems(bin_sizes) if freq >= 1],
                             dtype=np.float32)
        bin_seeds = bin_seeds * bandwidth
        print(len(bin_seeds))

        j = 0
        for x in bin_seeds:
            print("Seed ", j, ": ", x)
            i = 0
            o_mean = x
            while True:
                i_nbrs = nbrs.radius_neighbors([o_mean], bandwidth,
                                               return_distance=False)[0]
                points_within = X[i_nbrs]
                feed = {neighbors: points_within}
                n_mean = sess.run(new_mean, feed_dict=feed)
                feed = {new_mean: n_mean, old_mean: o_mean}
                dist = sess.run(euclid_dist, feed_dict=feed)
                if dist < 1e-3 * bandwidth or i == max_iter:
                    center_intensity_dict[tuple(n_mean)] = len(i_nbrs)
                    break
                else:
                    o_mean = n_mean
                print("\t", i, dist, len(i_nbrs))
                i += 1
            # if j > 10:
            #     break
            j += 1
        print(center_intensity_dict)

        sorted_by_intensity = sorted(center_intensity_dict.items(),
                                     key=lambda tup: tup[1], reverse=True)
        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
        unique = np.ones(len(sorted_centers), dtype=bool)
        nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors(
                    [center], return_distance=False)[0]
                unique[neighbor_idxs] = 0
                unique[i] = 1  # leave the current point as unique
        cluster_centers = sorted_centers[unique]

        nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
        labels = np.zeros(154401, dtype=int)  # hard-coded 481x321 image
        distances, idxs = nbrs.kneighbors(X)
        labels = idxs.flatten()
    return cluster_centers, labels
def k_nearest_neighbors(coordinates, neighbor_cutoff, max_num_neighbors=None,
                        p_distance=2, self_loops=False):
    """Find k nearest neighbors for each atom

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than
        neighbor_cutoff, they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski
        (:math:`l_p`) distance. When ``p_distance = 1``, Minkowski distance
        is equivalent to Manhattan distance. When ``p_distance = 2``,
        Minkowski distance is equivalent to the standard Euclidean distance.
        Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to ``srcs`` and
        ``dsts``.

    Examples
    --------
    >>> from dgllife.utils import get_mol_3d_coordinates, k_nearest_neighbors
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> srcs, dsts, dists = k_nearest_neighbors(coords, neighbor_cutoff=1.25)
    >>> print(srcs)
    [8, 7, 11, 10, 20, 19]
    >>> print(dsts)
    [7, 8, 10, 11, 19, 20]
    >>> print(dists)
    [1.2084666104583117, 1.2084666104583117, 1.226457824344217,
     1.226457824344217, 1.2230522248065987, 1.2230522248065987]

    See Also
    --------
    get_mol_3d_coordinates
    mol_to_nearest_neighbor_graph
    smiles_to_nearest_neighbor_graph
    """
    num_atoms = coordinates.shape[0]
    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    srcs, dsts, dists = [], [], []
    for i in range(num_atoms):
        dists_i = dists_[i].tolist()
        nbrs_i = nbrs[i].tolist()
        if not self_loops:
            dists_i.remove(0)
            nbrs_i.remove(i)
        if max_num_neighbors is not None and len(nbrs_i) > max_num_neighbors:
            packed_nbrs = list(zip(dists_i, nbrs_i))
            # Sort neighbors based on distance from smallest to largest
            packed_nbrs.sort(key=lambda tup: tup[0])
            dists_i, nbrs_i = map(list, zip(*packed_nbrs))
            dsts.extend([i for _ in range(max_num_neighbors)])
            srcs.extend(nbrs_i[:max_num_neighbors])
            dists.extend(dists_i[:max_num_neighbors])
        else:
            dsts.extend([i for _ in range(len(nbrs_i))])
            srcs.extend(nbrs_i)
            dists.extend(dists_i)

    return srcs, dsts, dists
def k_nearest_neighbors(coordinates, neighbor_cutoff, max_num_neighbors=None,
                        p_distance=2, self_loops=False):
    """Find k nearest neighbors for each atom

    We do not guarantee that the edges are sorted according to the distance
    between atoms.

    Parameters
    ----------
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than
        neighbor_cutoff, they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski
        (:math:`l_p`) distance. When ``p_distance = 1``, Minkowski distance
        is equivalent to Manhattan distance. When ``p_distance = 2``,
        Minkowski distance is equivalent to the standard Euclidean distance.
        Default to 2.
    self_loops : bool
        Whether to allow a node to be its own neighbor. Default to False.

    Returns
    -------
    srcs : list of int
        Source nodes.
    dsts : list of int
        Destination nodes, corresponding to ``srcs``.
    distances : list of float
        Distances between the end nodes, corresponding to ``srcs`` and
        ``dsts``.
    """
    num_atoms = coordinates.shape[0]
    model = NearestNeighbors(radius=neighbor_cutoff, p=p_distance)
    model.fit(coordinates)
    dists_, nbrs = model.radius_neighbors(coordinates)
    srcs, dsts, dists = [], [], []
    for i in range(num_atoms):
        dists_i = dists_[i].tolist()
        nbrs_i = nbrs[i].tolist()
        if not self_loops:
            dists_i.remove(0)
            nbrs_i.remove(i)
        if max_num_neighbors is not None and len(nbrs_i) > max_num_neighbors:
            packed_nbrs = list(zip(dists_i, nbrs_i))
            # Sort neighbors based on distance from smallest to largest
            packed_nbrs.sort(key=lambda tup: tup[0])
            dists_i, nbrs_i = map(list, zip(*packed_nbrs))
            dsts.extend([i for _ in range(max_num_neighbors)])
            srcs.extend(nbrs_i[:max_num_neighbors])
            dists.extend(dists_i[:max_num_neighbors])
        else:
            dsts.extend([i for _ in range(len(nbrs_i))])
            srcs.extend(nbrs_i)
            dists.extend(dists_i)

    return srcs, dsts, dists
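# Minimal usage sketch for k_nearest_neighbors on synthetic coordinates
# (no RDKit needed): every pair of points closer than the cutoff becomes
# a bidirectional edge.
import numpy as np

coords = np.array([[0.0, 0.0], [0.5, 0.0], [5.0, 5.0]])
srcs, dsts, dists = k_nearest_neighbors(coords, neighbor_cutoff=1.0)
print(srcs, dsts, dists)  # [1, 0] [0, 1] [0.5, 0.5]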
def mean_shift_clustering(data, bandwidth=0.7, min_bin_freq=5, max_iter=300):
    """Pipeline of mean shift clustering.

    Args:
        data (np.ndarray) : Input data with shape (n_samples, n_features)
        bandwidth (float) : Bandwidth parameter for the mean shift algorithm.
        min_bin_freq (int) : Parameter for the get_bin_seeds function;
            the minimal number of points each bin seed should cover.
        max_iter (int) : Max iterations for mean shift.

    Returns:
        labels (np.ndarray) : Input/output integer array that stores the
            cluster indices for every sample. The shape is (n_samples, 1).
        centers (np.ndarray) : Output matrix of the cluster centers, one row
            per cluster center. The shape is (k, n_features).
    """
    start = time()
    n_jobs = None
    seeds = get_bin_seeds(data, bandwidth, min_bin_freq)
    n_samples, n_features = data.shape
    center_intensity_dict = {}

    # We use n_jobs=1 because this will be used in nested calls under
    # parallel calls to _mean_shift_single_seed, so there is no need
    # for further parallelism.
    nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(data)

    # execute iterations on all seeds in parallel
    all_res = Parallel(n_jobs=n_jobs)(
        delayed(mean_shift_single_seed)(seed, data, nbrs, max_iter)
        for seed in seeds)
    # copy results in a dictionary
    for i in range(len(seeds)):
        if all_res[i] is not None:
            center_intensity_dict[all_res[i][0]] = all_res[i][1]

    if not center_intensity_dict:
        # nothing near seeds
        raise ValueError("No point was within bandwidth=%f of any seed."
                         " Try a different seeding strategy"
                         " or increase the bandwidth." % bandwidth)

    """ YOUR CODE STARTS HERE """
    # get all peaks of windows
    peaks = np.array(list(center_intensity_dict.keys()))
    # construct a neighbors index for the peaks
    nbrs_peaks = NearestNeighbors(radius=bandwidth, n_jobs=1).fit(peaks)
    centers = set()
    for p in peaks:
        # find peaks within bandwidth
        nb = nbrs_peaks.radius_neighbors(np.expand_dims(p, axis=0))
        indices = nb[1][0]
        # if more than 1 peak, keep only the most intense one
        if len(indices) > 1:
            keys = peaks[indices]
            duplicate_peaks = {
                tuple(k): center_intensity_dict[tuple(k)] for k in keys
            }
            max_key = tuple(max(duplicate_peaks, key=duplicate_peaks.get))
        else:
            max_key = tuple(p)
        centers.add(max_key)
    centers = np.array(list(centers))

    # assign points to the nearest cluster peak
    nbrs_centers = NearestNeighbors(n_neighbors=1, n_jobs=1).fit(centers)
    d, labels = nbrs_centers.kneighbors(data)
    """ YOUR CODE ENDS HERE """

    end = time()
    kmeans_runtime = end - start
    print("mean shift running time: %.3fs." % kmeans_runtime)
    return labels, centers
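# Hypothetical usage sketch for mean_shift_clustering above (assumes
# get_bin_seeds, Parallel/delayed and a mean_shift_single_seed helper are
# imported as in the original module; the two blobs are synthetic).
import numpy as np

rng = np.random.RandomState(0)
blobs = np.vstack([rng.normal(0.0, 0.3, (100, 2)),
                   rng.normal(3.0, 0.3, (100, 2))])
labels, centers = mean_shift_clustering(blobs, bandwidth=0.7)
print(centers)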
class AnomalyModel:
    def __init__(self, trainingSet, anomalyMethod="KNN", h=None):
        self.method = anomalyMethod
        if self.method == "online":
            self.h = h
        if self.method == "centroid":
            self.h = Util.centroid(trainingSet)
        if self.method == "medoid":
            self.h = Util.medoid(trainingSet)
        if self.method == "IGNG":
            self.h = IGNG(radius=PARAMS["R"])  # IGNG.estimate_radius(trainingSet)
            self.h.train(trainingSet)
            # print(len(self.h.get_nodes_positions()), len(trainingSet))
        if self.method == "GNG":
            self.h = GNG(period=50)
            self.h.train(trainingSet)
        if self.method == "KNN":
            self.h = NearestNeighbors(algorithm='ball_tree',
                                      metric='euclidean').fit(trainingSet)
        if self.method == "RNN":
            self.h = NearestNeighbors(algorithm='ball_tree',
                                      metric='euclidean').fit(trainingSet)
        if self.method == "SVM":
            self.h = svm.OneClassSVM(nu=PARAMS["NU"], kernel="rbf",
                                     gamma=PARAMS["GAMMA"]).fit(trainingSet)

    def getAnomalyScore(self, x, inversed=False):
        if self.method == "online":
            alpha_m = self.h.getNearestDist(x)
            # alpha_m = self.h.getNearestDistToMature(x)
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "centroid":
            alpha_m = Util.dist(x, self.h)
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "medoid":
            alpha_m = Util.dist(x, self.h)
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "IGNG":
            alpha_m = self.h.getNearestDist(x)
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "GNG":
            alpha_m = self.h.getNearestDist(x)
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "KNN":
            distances, indices = self.h.kneighbors(x,
                                                   n_neighbors=PARAMS["K"])
            alpha_m = sum(distances[0])
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "RNN":
            distances, indices = self.h.radius_neighbors(x,
                                                         radius=PARAMS["R"])
            alpha_m = 1. / (1. + sum([1. / di for di in distances[0]
                                      if di != 0]))
            if inversed:
                alpha_m = 1. / alpha_m
        if self.method == "SVM":
            alpha_m = -1. * self.h.decision_function(x)[0][0]
            if inversed:
                alpha_m = -1. * alpha_m
        return alpha_m
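# Hypothetical usage sketch for AnomalyModel with the radius-based "RNN"
# scorer. PARAMS normally comes from the enclosing project; the values
# below are made up for illustration, and NearestNeighbors is assumed to
# be imported at module level as in the original file.
import numpy as np

PARAMS = {"K": 5, "R": 0.5, "NU": 0.1, "GAMMA": 0.1}
train = np.random.RandomState(0).rand(200, 2)
model = AnomalyModel(train, anomalyMethod="RNN")
print(model.getAnomalyScore(train[:1]))               # low: dense region
print(model.getAnomalyScore(np.array([[5.0, 5.0]])))  # 1.0: no neighbors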
def __init__(self, root_dir, cities='', nNeg=5, transform=None, mode='train', task='im2im', subtask='all',
             seq_length=1, posDistThr=10, negDistThr=25, cached_queries=1000, cached_negatives=1000,
             positive_sampling=True, bs=24, threads=8, margin=0.1, exclude_panos=True):

    # initializing
    assert mode in ('train', 'val', 'test')
    assert task in ('im2im', 'im2seq', 'seq2im', 'seq2seq')
    assert subtask in ('all', 's2w', 'w2s', 'o2n', 'n2o', 'd2n', 'n2d')
    assert seq_length % 2 == 1
    assert (task == 'im2im' and seq_length == 1) or (task != 'im2im' and seq_length > 1)

    if cities in default_cities:
        self.cities = default_cities[cities]
    elif cities == '':
        self.cities = default_cities[mode]
    else:
        self.cities = cities.split(',')

    self.qIdx = []
    self.qImages = []
    self.pIdx = []
    self.nonNegIdx = []
    self.dbImages = []
    self.sideways = []
    self.night = []
    self.all_pos_indices = []

    # hyper-parameters
    self.nNeg = nNeg
    self.margin = margin
    self.posDistThr = posDistThr
    self.negDistThr = negDistThr
    self.cached_queries = cached_queries
    self.cached_negatives = cached_negatives

    # flags
    self.cache = None
    self.exclude_panos = exclude_panos
    self.mode = mode
    self.subtask = subtask
    print('Exclude panoramas:', self.exclude_panos)

    # other
    self.transform = transform

    # define sequence length based on task
    if task == 'im2im':
        seq_length_q, seq_length_db = 1, 1
    elif task == 'seq2seq':
        seq_length_q, seq_length_db = seq_length, seq_length
    elif task == 'seq2im':
        seq_length_q, seq_length_db = seq_length, 1
    else:  # im2seq
        seq_length_q, seq_length_db = 1, seq_length

    # load data
    for city in self.cities:
        print("=====> {}".format(city))

        subdir = 'test' if city in default_cities['test'] else 'train_val'

        # get len of images from cities so far for indexing
        _lenQ = len(self.qImages)
        _lenDb = len(self.dbImages)

        # when GPS / UTM is available
        if self.mode in ['train', 'val']:
            # load query data
            qData = pd.read_csv(join(root_dir, subdir, city, 'query', 'postprocessed.csv'), index_col=0)
            qDataRaw = pd.read_csv(join(root_dir, subdir, city, 'query', 'raw.csv'), index_col=0)

            # load database data
            dbData = pd.read_csv(join(root_dir, subdir, city, 'database', 'postprocessed.csv'), index_col=0)
            dbDataRaw = pd.read_csv(join(root_dir, subdir, city, 'database', 'raw.csv'), index_col=0)

            # arrange based on task
            qSeqKeys, qSeqIdxs = self.arange_as_seq(
                qData, join(root_dir, subdir, city, 'query'), seq_length_q)
            dbSeqKeys, dbSeqIdxs = self.arange_as_seq(
                dbData, join(root_dir, subdir, city, 'database'), seq_length_db)

            # filter based on subtasks
            if self.mode in ['val']:
                qIdx = pd.read_csv(join(root_dir, subdir, city, 'query', 'subtask_index.csv'), index_col=0)
                dbIdx = pd.read_csv(join(root_dir, subdir, city, 'database', 'subtask_index.csv'), index_col=0)

                # find all the sequences whose center frame belongs to the subtask
                val_frames = np.where(qIdx[self.subtask])[0]
                qSeqKeys, qSeqIdxs = self.filter(qSeqKeys, qSeqIdxs, val_frames)

                val_frames = np.where(dbIdx[self.subtask])[0]
                dbSeqKeys, dbSeqIdxs = self.filter(dbSeqKeys, dbSeqIdxs, val_frames)

            # filter based on panorama data
            if self.exclude_panos:
                panos_frames = np.where((qDataRaw['pano'] == False).values)[0]
                qSeqKeys, qSeqIdxs = self.filter(qSeqKeys, qSeqIdxs, panos_frames)

                panos_frames = np.where((dbDataRaw['pano'] == False).values)[0]
                dbSeqKeys, dbSeqIdxs = self.filter(dbSeqKeys, dbSeqIdxs, panos_frames)

            unique_qSeqIdx = np.unique(qSeqIdxs)
            unique_dbSeqIdx = np.unique(dbSeqIdxs)

            # if a combination of city, task and subtask is chosen for which there
            # are no query/database images, continue to the next city
            if len(unique_qSeqIdx) == 0 or len(unique_dbSeqIdx) == 0:
                continue

            self.qImages.extend(qSeqKeys)
            self.dbImages.extend(dbSeqKeys)

            qData = qData.loc[unique_qSeqIdx]
            dbData = dbData.loc[unique_dbSeqIdx]

            # useful indexing functions
            seqIdx2frameIdx = lambda seqIdx, seqIdxs: seqIdxs[seqIdx]
            # frameIdx2seqIdx = lambda frameIdx, seqIdxs: np.where(seqIdxs == frameIdx)[0][1]
            frameIdx2uniqFrameIdx = lambda frameIdx, uniqFrameIdx: np.where(
                np.in1d(uniqFrameIdx, frameIdx))[0]
            uniqFrameIdx2seqIdx = lambda frameIdxs, seqIdxs: \
                np.where(np.in1d(seqIdxs, frameIdxs).reshape(seqIdxs.shape))[0]

            # utm coordinates
            utmQ = qData[['easting', 'northing']].values.reshape(-1, 2)
            utmDb = dbData[['easting', 'northing']].values.reshape(-1, 2)

            night, sideways, index = qData['night'].values, (
                qData['view_direction'] == 'Sideways').values, qData.index

            # find positive images for training
            neigh = NearestNeighbors(algorithm='brute')
            neigh.fit(utmDb)
            pos_distances, pos_indices = neigh.radius_neighbors(utmQ, self.posDistThr)
            self.all_pos_indices.extend(pos_indices)

            if self.mode == 'train':
                nD, nI = neigh.radius_neighbors(utmQ, self.negDistThr)

            for q_seq_idx in range(len(qSeqKeys)):

                q_frame_idxs = seqIdx2frameIdx(q_seq_idx, qSeqIdxs)
                q_uniq_frame_idx = frameIdx2uniqFrameIdx(q_frame_idxs, unique_qSeqIdx)

                p_uniq_frame_idxs = np.unique([p for pos in pos_indices[q_uniq_frame_idx] for p in pos])

                # the query image has at least one positive
                if len(p_uniq_frame_idxs) > 0:
                    p_seq_idx = np.unique(uniqFrameIdx2seqIdx(unique_dbSeqIdx[p_uniq_frame_idxs], dbSeqIdxs))

                    self.pIdx.append(p_seq_idx + _lenDb)
                    self.qIdx.append(q_seq_idx + _lenQ)

                    # in training we have two thresholds, one for finding positives and one for
                    # finding images that we are certain are negatives.
                    if self.mode == 'train':

                        n_uniq_frame_idxs = np.unique([n for nonNeg in nI[q_uniq_frame_idx] for n in nonNeg])
                        n_seq_idx = np.unique(uniqFrameIdx2seqIdx(unique_dbSeqIdx[n_uniq_frame_idxs], dbSeqIdxs))

                        self.nonNegIdx.append(n_seq_idx + _lenDb)

                        # gather meta which is useful for positive sampling
                        if sum(night[np.in1d(index, q_frame_idxs)]) > 0:
                            self.night.append(len(self.qIdx) - 1)
                        if sum(sideways[np.in1d(index, q_frame_idxs)]) > 0:
                            self.sideways.append(len(self.qIdx) - 1)

        # when GPS / UTM / pano info is not available
        elif self.mode in ['test']:

            # load images for subtask
            qIdx = pd.read_csv(join(root_dir, subdir, city, 'query', 'subtask_index.csv'), index_col=0)
            dbIdx = pd.read_csv(join(root_dir, subdir, city, 'database', 'subtask_index.csv'), index_col=0)

            # arrange in sequences
            qSeqKeys, qSeqIdxs = self.arange_as_seq(
                qIdx, join(root_dir, subdir, city, 'query'), seq_length_q)
            dbSeqKeys, dbSeqIdxs = self.arange_as_seq(
                dbIdx, join(root_dir, subdir, city, 'database'), seq_length_db)

            # filter query based on subtask
            val_frames = np.where(qIdx[self.subtask])[0]
            qSeqKeys, qSeqIdxs = self.filter(qSeqKeys, qSeqIdxs, val_frames)

            # filter database based on subtask
            val_frames = np.where(dbIdx[self.subtask])[0]
            dbSeqKeys, dbSeqIdxs = self.filter(dbSeqKeys, dbSeqIdxs, val_frames)

            self.qImages.extend(qSeqKeys)
            self.dbImages.extend(dbSeqKeys)

            # add query index
            self.qIdx.extend(list(range(_lenQ, len(qSeqKeys) + _lenQ)))

    # if a combination of cities, task and subtask is chosen for which there are
    # no query/database images, then exit
    if len(self.qImages) == 0 or len(self.dbImages) == 0:
        print("Exiting...")
        print("A combination of cities, task and subtask has been chosen for which there are no query/database images.")
        print("Try choosing a different subtask or more cities")
        sys.exit()

    # cast to np.arrays for indexing during training
    self.qIdx = np.asarray(self.qIdx)
    self.qImages = np.asarray(self.qImages)
    self.pIdx = np.asarray(self.pIdx)
    self.nonNegIdx = np.asarray(self.nonNegIdx)
    self.dbImages = np.asarray(self.dbImages)
    self.sideways = np.asarray(self.sideways)
    self.night = np.asarray(self.night)

    # decide device type (important for triplet mining)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.threads = threads
    self.bs = bs

    if mode == 'train':
        # for now always 1-1 lookup.
        self.negCache = np.asarray([np.empty((0,), dtype=int)] * len(self.qIdx))

        # calculate weights for positive sampling
        if positive_sampling:
            self.__calcSamplingWeights__()
        else:
            self.weights = np.ones(len(self.qIdx)) / float(len(self.qIdx))
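# Minimal sketch of the positive / non-negative mining performed above: for each
# query UTM coordinate, positives are database entries within posDistThr and
# definite negatives lie outside negDistThr (coordinates are synthetic; the
# 10 m / 25 m radii mirror the defaults of the __init__ above).
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
utmDb = rng.rand(50, 2) * 100
utmQ = rng.rand(5, 2) * 100
neigh = NearestNeighbors(algorithm='brute').fit(utmDb)
pos_idx = neigh.radius_neighbors(utmQ, radius=10, return_distance=False)
nonneg_idx = neigh.radius_neighbors(utmQ, radius=25, return_distance=False)
for q, (p, nn) in enumerate(zip(pos_idx, nonneg_idx)):
    negs = np.setdiff1d(np.arange(len(utmDb)), nn)  # everything outside 25 m
    print('query %d: %d positives, %d definite negatives' % (q, len(p), len(negs)))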
def pms(substack, args):
    """Find cells using mean shift. In this version, the substack is split into eight patches.

    Parameters
    ----------
    substack : object
        :class:`bcfind.volume.SubStack` object representing the substack to be analyzed.
    args : object
        :py:class:`argparse.Namespace` object containing the arguments passed to the find_cells script,
        in particular

        - args.outdir: directory where results are saved
        - args.hi_local_max_radius: radius of the sphere used to decide whether a local maximum should be a seed
        - args.mean_shift_bandwidth: bandwidth for the mean shift algorithm
        - args.floating_point: bool, whether cell coordinates should be rounded before saving
    """
    D = substack.info['Depth']
    W = substack.info['Width']
    H = substack.info['Height']
    M = 20
    patch = np.zeros((W, H, D))
    for z in range(D):
        patch[:, :, z] = np.array(substack.imgs[z]).T
    # integer division keeps the slice bounds valid indices under Python 3
    slicesx = [slice(0, W // 2 + M), slice(W // 2 - M, W)]
    slicesy = [slice(0, H // 2 + M), slice(H // 2 - M, H)]
    slicesz = [slice(0, D // 2 + M), slice(D // 2 - M, D)]
    cluster_centers = np.zeros((0, 3))
    cluster_masses = np.zeros(0)
    L = np.zeros((0, 3))
    labels = np.zeros(0)
    seeds = []
    counter = 0
    for sx in slicesx:
        for sy in slicesy:
            for sz in slicesz:
                counter += 1
                tee.log('%d/8:' % counter, 'Analyzing minisubstack', sx, sy, sz)
                rval = _patch_ms(patch[sx, sy, sz], args)
                origin = [sx.start, sy.start, sz.start]
                if rval is not None:
                    cluster_centers = np.concatenate(
                        (cluster_centers, rval.cluster_centers + origin))
                    cluster_masses = np.concatenate(
                        (cluster_masses, rval.masses))
                    labels = np.concatenate(
                        (labels, rval.labels + len(rval.cluster_centers)))
                    L = np.concatenate((L, rval.L + origin))
                    for c in rval.seeds:
                        c.x += origin[0]
                        c.y += origin[1]
                        c.z += origin[2]
                    seeds.extend(rval.seeds)
    if len(cluster_centers) > 0:
        # remove near duplicate points (because of overlapping margins)
        indices = np.argsort(cluster_masses)
        sorted_centers = cluster_centers[indices]
        sorted_masses = cluster_masses[indices]
        # sorted_volumes = volumes[indices]
        unique = np.ones(len(sorted_centers), dtype=bool)
        # FIXME - make it a parameter
        nbrs = NearestNeighbors(radius=5.5).fit(sorted_centers)
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]
                unique[neighbor_idxs] = 0
                unique[i] = 1  # leave the current point as unique
        cluster_centers = sorted_centers[unique]
        masses = sorted_masses[unique]
        masses_mean = np.mean(masses)
        masses_std = np.std(masses)
        # volumes = sorted_volumes[unique]
        C = []
        for i, cc in enumerate(cluster_centers):
            c = volume.Center(cc[0], cc[1], cc[2])
            c.name = 'MS_center %d' % i
            c.volume = (masses[i] - masses_mean) / masses_std  # volumes[i]
            c.mass = masses[i]
            tee.log(i, cc, c)
            C.append(c)
        filename = args.outdir + '/ms.marker'
        substack.save_markers(filename, C, floating_point=args.floating_point)
        tee.log('Markers saved to', filename)
        filename = args.outdir + '/seeds.marker'
        substack.save_markers(filename, seeds)
        tee.log(len(seeds), 'seeds saved to', filename)
        up_outdir = dirname(abspath(args.outdir))
        if args.save_image:
            image_saver = volume.ImageSaver(up_outdir, substack, C)
            Lx = [int(x) for x in L[:, 0]]
            Ly = [int(y) for y in L[:, 1]]
            Lz = [int(z) for z in L[:, 2]]
            image_saver.save_above_threshold(Lx, Ly, Lz)
            tee.log('Debugging images saved in', up_outdir)
        else:
            tee.log('Debugging images not saved')
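# Sketch of the de-duplication step inside pms above: centers produced by
# overlapping patches are collapsed so that each radius-5.5 neighborhood keeps a
# single representative (the first center reached in mass order absorbs the
# rest). Synthetic centers; the 5.5 radius is the FIXME value used above.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
centers = np.repeat(rng.rand(10, 3) * 100, 3, axis=0) + rng.rand(30, 3)  # triples of near-duplicates
masses = rng.rand(30)
sorted_centers = centers[np.argsort(masses)]
unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=5.5).fit(sorted_centers)
for i, center in enumerate(sorted_centers):
    if unique[i]:
        unique[nbrs.radius_neighbors([center], return_distance=False)[0]] = False
        unique[i] = True  # keep the current point itself
print(unique.sum(), 'unique centers out of', len(centers))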
def posq_rrtstar(self, sample_fn, bias=0.06):
    """ RRT* Algorithm """
    print("KINODYNAMIC PLANNING")
    marker_points = MarkerArray()
    vol_freecells = len(self._freecells) * self._navmap.info.resolution ** 2
    print("FREE CELL VOLUME", vol_freecells)
    gamma_rrg = 2 * sqrt(1.5 * vol_freecells / pi)
    probot = np.array([self._robot_pose.pose.position.x, self._robot_pose.pose.position.y,
                       2 * np.arccos(self._robot_pose.pose.orientation.w)])
    # V is a list of vertices. E is a dictionary where each key is connected with its values.
    # parents is a dictionary mapping a node to its parent; since each node appears as a key
    # only once, it has a single parent.
    V = [probot]
    p4dist = np.zeros(4)
    p4dist[:2] = probot[:2]; p4dist[2] = np.cos(probot[2]); p4dist[3] = np.sin(probot[2])
    V_xy = [p4dist]  # V_xy is the data structure for the nearest neighbours
    # V_xy = [probot[:2]]
    E = {}
    parents = {}
    Dist = [0.0]
    # edge_C stores the cost of each edge; the cost at a vertex is the sum of the edges leading to it.
    goal_xy = np.array([self._goal.pose.position.x, self._goal.pose.position.y,
                        2 * np.arccos(self._goal.pose.orientation.w)])
    c_init = self.cost_manager.get_cost(probot[:2], goal_xy[:2])
    edge_C = {}
    planning_time = self.planning_time
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree", leaf_size=30)
    lowest_cost_idx = None
    nbrs.fit(V_xy)
    t1 = time.time()
    planning_done = False
    rrt_iter = 0
    pub_path = []
    while not planning_done:
        t2 = time.time()
        # bias *= 1.001  # gradually reduce the bias for the target
        """ Sampling new point """
        reached = False
        samp_count = 0
        alternative = True
        while reached == False:
            prand, g_s = sample_fn(goal_xy, bias=bias)
            p4dist = np.zeros(4)
            p4dist[:2] = prand[:2]; p4dist[2] = np.cos(prand[2]); p4dist[3] = np.sin(prand[2])
            # (dist, idx) = nbrs.kneighbors(prand.reshape(1, -1))
            (dist, idx) = nbrs.kneighbors(p4dist.reshape(1, -1))
            pnearest_idx = idx.ravel()[0]
            pnearest = V[pnearest_idx]
            """ Turning new point into reachable point """
            stp = 50
            path_new, reached, stp = posq.simulate(pnearest, prand, steps=stp, return_steps=True)
            pnew = path_new[-1]
            if alternative == True:
                d = prand[:2] - pnearest[:2]
                ang = np.arctan2(d[1], d[0])
                add = np.array([self._rrt_eta * np.cos(ang), self._rrt_eta * np.sin(ang), ang])
                pnew = np.zeros(3)
                pnew[:2] = pnearest[:2] + add[:2]
                pnew[2] = ang
            # self.publish_local_path(pub_path)
            # pnew = [pnearest[0] + self._rrt_eta*np.cos(pnearest[2]), pnearest[1] + self._rrt_eta*np.sin(pnearest[2]), pnearest[2]]
            if reached == True:
                pnew = prand
            elif reached == False:
                stp = 400
                path_new, reached, stp = posq.simulate(pnearest, pnew, steps=stp, return_steps=True, eps=0.1)
                pnew = path_new[-1]
        """ Checking if segment is valid and updating graph """
        # stp = 40
        # stp = 30
        if self.path_safe(path_new):
            r = np.min([gamma_rrg * sqrt(log(len(V)) / float(len(V))), self._rrt_eta])
            p4dist = np.zeros(4)
            p4dist[:2] = pnew[:2]; p4dist[2] = np.cos(pnew[2]); p4dist[3] = np.sin(pnew[2])
            Pnear_idx = nbrs.radius_neighbors(p4dist.reshape(1, -1), r, return_distance=False)
            Pnear_idx = Pnear_idx[0]
            pmin_idx = pnearest_idx
            min_edge_c = self.cost_manager.path_cost(path_new, goal_xy[:2])
            cum_c = self.integrate_costs(edge_C, parents, pnearest_idx)
            cmin = cum_c + min_edge_c
            # if len(Pnear_idx) > 5:
            #     Pnear_idx = Pnear_idx[:5]
            cumulative_costs = []
            for p_idx in Pnear_idx:
                p = V_xy[p_idx]
                p_xyz = V[p_idx]
                cum_cost = self.integrate_costs(edge_C, parents, p_idx)
                cumulative_costs.append(cum_cost)
                # WATCH OUT: you might get a nearest-neighbour problem if the steps are not good enough;
                # perhaps we could simulate distances so that the nearest-neighbour calculation stays consistent.
                p_idx_path, reached = posq.simulate(p_xyz, pnew, steps=int(stp), eps=0.1)
                # reached = False
                safe = self.path_safe(p_idx_path)
                if reached == True and safe == True:
                    path_c = self.cost_manager.path_cost(p_idx_path, goal_xy[:2])
                else:
                    path_c = 0
                    # reached = False
                c = cum_cost + path_c
                if (safe is True and reached is True and c < cmin):
                    cmin = c
                    min_edge_c = path_c
                    pmin_idx = p_idx
            if pmin_idx in E:
                E[pmin_idx].add(len(V))
            else:
                E[pmin_idx] = set([len(V)])
            edge_C[pmin_idx, len(V)] = min_edge_c
            cumulative_last = cmin
            pnew_idx = len(V)
            V.append(pnew)
            # V_xy.append(pnew[:2])
            V_xy.append(p4dist)
            parents[pnew_idx] = pmin_idx
            """ Re-wire the tree """
            for en, p_idx in enumerate(Pnear_idx):
                # rewire only nodes that already have a parent
                if p_idx in parents:
                    p = V_xy[p_idx]
                    p_xyz = V[p_idx]
                    rewire_path, rewire_reached = posq.simulate(pnew, p_xyz, steps=int(stp), eps=0.1)
                    # rewire_reached = False
                    rewire_safe = self.path_safe(rewire_path)
                    if rewire_reached == True and rewire_safe == True:
                        rewire_path_c = self.cost_manager.path_cost(rewire_path, goal_xy[:2])
                    else:
                        rewire_path_c = 0
                    c = cumulative_last + rewire_path_c
                    if (rewire_safe is True and c < cumulative_costs[en] and rewire_reached is True):
                        E[parents[p_idx]].remove(p_idx)
                        # drop the old edge keyed by the (parent, child) tuple
                        edge_C.pop((parents[p_idx], p_idx), None)
                        edge_C[pnew_idx, p_idx] = rewire_path_c
                        parents[p_idx] = pnew_idx
                        if pnew_idx in E:
                            E[pnew_idx].add(p_idx)
                        else:
                            E[pnew_idx] = set([p_idx])
            nbrs.fit(V_xy)
        rrt_iter += 1
        if time.time() - t1 > self.max_planning_time:
            p4dist = np.zeros(4)
            p4dist[:2] = goal_xy[:2]; p4dist[2] = np.cos(goal_xy[2]); p4dist[3] = np.sin(goal_xy[2])
            # expand the radius until at least one vertex lies near the goal
            points_near_goal = []
            add = 0
            while len(points_near_goal) == 0:
                dist, points_near_goal = nbrs.radius_neighbors(p4dist.reshape(1, -1),
                                                               self.goal_tolerance + add,
                                                               return_distance=True)
                points_near_goal = points_near_goal[0]
                add += 0.1
            print("Could not find solution for 10 seconds, going with solution closest to goal.")
            planning_done = True
        elif time.time() - t1 > planning_time:
            p4dist = np.zeros(4)
            p4dist[:2] = goal_xy[:2]; p4dist[2] = np.cos(goal_xy[2]); p4dist[3] = np.sin(goal_xy[2])
            dist, points_near_goal = nbrs.radius_neighbors(p4dist.reshape(1, -1),
                                                           self.goal_tolerance + 0.2,
                                                           return_distance=True)
            # dist, points_near_goal = nbrs.radius_neighbors(goal_xy, self.goal_tolerance, return_distance=True)
            dist, point = nbrs.kneighbors(p4dist.reshape(1, -1))
            print("DISTANCE FROM CLOSEST", dist)
            points_near_goal = points_near_goal[0]
            if len(points_near_goal) == 0:
                planning_done = False
                planning_time += 5.
                if bias < 0.5:
                    bias = 0.9
            else:
                planning_done = True
    # self.publish_rrt(V, E)
    # self.samp_point_pub.publish(marker_points)
    """ Find best path: """
    min_cost = float('inf')
    for i in points_near_goal:
        c_path = self.integrate_costs(edge_C, parents, i)
        if c_path < min_cost:
            m = i
            min_cost = c_path
    print(len(V))
    self.publish_rrt(V, E)
    print("MINIMUM PATH COST RRT", min_cost)
    path = self.get_path(parents, V, m)
    pt = path_to_pose(path)
    print('total time: ', time.time() - t1)
    self._path_pub.publish(pt)
    return pt, path
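# Side note on the planner above (illustrative sketch): a pose (x, y, theta) is
# embedded as (x, y, cos(theta), sin(theta)) before being handed to the kd-tree,
# so the Euclidean metric also accounts for heading without a 2*pi wrap-around.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def pose_to_4d(pose):
    x, y, theta = pose
    return np.array([x, y, np.cos(theta), np.sin(theta)])

poses = [[0., 0., 0.], [0., 0., np.pi], [1.5, 0., 0.1]]
nbrs = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit([pose_to_4d(p) for p in poses])
dist, idx = nbrs.kneighbors(pose_to_4d([0., 0., 3.0]).reshape(1, -1))
print('nearest pose:', poses[idx[0][0]])  # picks the same position with the closer heading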
class NeighborSimilarity(ISimilarity):
    """The neighborhood similarity model.

    The neighbor similarity model determines similarity between the data in the
    indexing structure and the query data by using the nearest neighbor algorithm
    :class:`sklearn.neighbors.NearestNeighbors`. Both a k-neighbors classifier and
    a radius-neighbor classifier are implemented. To choose between the classifiers,
    exactly one of `n_neighbors` or `radius` must be specified.

    Parameters
    ----------
    n_neighbors : int
        The number of data points considered to be closest neighbors.
    radius : float
        The radius around the query data point, within which the data points
        are considered closest neighbors.
    algorithm : str
        The internal indexing structure of the training data. Defaults to
        `kd_tree`.
    metric : str
        The metric used to compute the distances between pairs of points.
        Refer to :class:`sklearn.neighbors.DistanceMetric` for valid
        identifiers. Default is `euclidean`.
    metric_params : dict
        Parameters relevant to the specified metric.

    Raises
    ------
    UserWarning :
        If both or neither of `n_neighbors` and `radius` are given.

    See Also
    --------
    :class:`sklearn.neighbors.KNeighborsClassifier`,
    :class:`sklearn.neighbors.RadiusNeighborsClassifier`

    """
    def __init__(self, n_neighbors=None, radius=None, algorithm=None, metric=None, metric_params=None):
        super(NeighborSimilarity, self).__init__()

        # exactly one of the two query modes must be selected
        if (n_neighbors is None) == (radius is None):
            raise UserWarning("Exactly one of n_neighbors or radius must be initialized.")
        self._n_neighbors = n_neighbors
        self._radius = radius

        if algorithm is not None:
            if algorithm not in ["ball_tree", "kd_tree", "brute", "auto"]:
                raise ValueError("%s is not a valid retrieval algorithm" % algorithm)
            self._algorithm = algorithm
        else:
            self._algorithm = "kd_tree"

        if metric is not None:
            if metric not in METRIC_MAPPING:
                raise ValueError("%s is not a valid retrieval metric" % metric)
            self._metric = metric
        else:
            self._metric = "euclidean"

        self._metric_params = metric_params if metric_params is not None else 2

    def build_indexing_structure(self, data, id_map):
        """Build the indexing structure.

        Build the indexing structure by fitting the data according to the
        specified algorithm.

        Parameters
        ----------
        data : ndarray[ndarray[float]]
            The raw data points to be indexed.
        id_map : dict[int, int]
            The mapping from the data points to their case ids.

        """
        self._id_map = id_map

        if self._n_neighbors is not None:
            self._indexing_structure = NearestNeighbors(n_neighbors=self._n_neighbors,
                                                        algorithm=self._algorithm,
                                                        metric=self._metric,
                                                        p=self._metric_params).fit(data)
        else:
            self._indexing_structure = NearestNeighbors(radius=self._radius,
                                                        algorithm=self._algorithm,
                                                        metric=self._metric,
                                                        p=self._metric_params).fit(data)

    def compute_similarity(self, data_point):
        """Computes the similarity.

        Computes the similarity between the data point and the data in
        the indexing structure using the :class:`sklearn.neighbors.NearestNeighbors`
        algorithm. The results are returned in a collection of similarity
        statistics (:class:`Stat`).

        Parameters
        ----------
        data_point : list[float]
            The raw data point to compare against the data points stored in the
            indexing structure.

        Returns
        -------
        list[Stat] :
            A collection of similarity statistics.
""" if self._n_neighbors is not None: # noinspection PyProtectedMember raw_data = self._indexing_structure._fit_X if len(raw_data) < self._n_neighbors: result = [] for i, feat in enumerate(raw_data): dist = np.linalg.norm(np.asarray(data_point) - np.asarray(feat)) result.append(Stat(self._id_map[i], dist)) # noinspection PyShadowingNames result = sorted(result, key=lambda x: x.similarity) else: d, key_lists = self._indexing_structure.kneighbors(data_point) result = [Stat(self._id_map[x], d[0][i]) for i, x in enumerate(key_lists[0])] else: d, key_lists = self._indexing_structure.radius_neighbors(data_point) result = [Stat(self._id_map[x], d[0][i]) for i, x in enumerate(key_lists[0])] return result
def make_cached_rrt(self, sample_fn, points_to_cache=4500, bias=0.02):
    """ Caching the RRT """
    print("NOW CACHING RRT ---")
    marker_points = MarkerArray()
    vol_freecells = len(self._freecells) * self._navmap.info.resolution ** 2
    gamma_rrg = 2 * sqrt(1.5 * vol_freecells / pi)
    probot = np.array([self._robot_pose.pose.position.x, self._robot_pose.pose.position.y,
                       2 * np.arccos(self._robot_pose.pose.orientation.w)])
    # V is a list of vertices. E is a dictionary where each key is connected with its values.
    # parents is a dictionary mapping a node to its parent; since each node appears as a key
    # only once, it has a single parent.
    V = [probot]
    p4dist = np.zeros(4)
    p4dist[:2] = probot[:2]; p4dist[2] = np.cos(probot[2]); p4dist[3] = np.sin(probot[2])
    V_xy = [p4dist]
    sampled_points = []
    Dist = [0.0]
    # edge_C stores the cost of each edge; the cost at a vertex is the sum of the edges leading to it.
    goal_xy = np.array([self._goal.pose.position.x, self._goal.pose.position.y,
                        2 * np.arccos(self._goal.pose.orientation.w)])
    edge_C = {}
    planning_time = self.planning_time
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree", leaf_size=30)
    lowest_cost_idx = None
    nbrs.fit(V_xy)
    t1 = time.time()
    planning_done = False
    rrt_iter = 0
    stp = 8
    while not planning_done:
        cached_nbrs = {}
        t2 = time.time()
        # bias /= 1.001  # gradually reduce the bias for the target
        """ Sampling new point """
        reached = False
        samp_count = 0
        alternative = True
        while reached == False:
            prand, g_s = sample_fn(goal_xy, bias=bias)
            p4dist = np.zeros(4)
            p4dist[:2] = prand[:2]; p4dist[2] = np.cos(prand[2]); p4dist[3] = np.sin(prand[2])
            # (dist, idx) = nbrs.kneighbors(prand.reshape(1, -1))
            (dist, idx) = nbrs.kneighbors(p4dist.reshape(1, -1))
            pnearest_idx = idx.ravel()[0]
            pnearest = V[pnearest_idx]
            """ Turning new point into reachable point """
            stp = 50
            path_new, reached, stp = posq.simulate(pnearest, prand, steps=stp, return_steps=True)
            pnew = path_new[-1]
            if alternative == True:
                d = prand[:2] - pnearest[:2]
                ang = np.arctan2(d[1], d[0])
                add = np.array([self._rrt_eta * np.cos(ang), self._rrt_eta * np.sin(ang), ang])
                pnew = np.zeros(3)
                pnew[:2] = pnearest[:2] + add[:2]
                pnew[2] = ang
            # self.publish_local_path(pub_path)
            # pnew = [pnearest[0] + self._rrt_eta*np.cos(pnearest[2]), pnearest[1] + self._rrt_eta*np.sin(pnearest[2]), pnearest[2]]
            if reached == True:
                pnew = prand
            elif reached == False:
                stp = 400
                path_new, reached, stp = posq.simulate(pnearest, pnew, steps=stp, return_steps=True, eps=0.1)
                pnew = path_new[-1]
        """ Checking if segment is valid and updating graph """
        if self.path_safe(path_new) is True:
            r = np.min([gamma_rrg * sqrt(log(len(V)) / float(len(V))), self._rrt_eta])
            p4dist = np.zeros(4)
            p4dist[:2] = pnew[:2]; p4dist[2] = np.cos(pnew[2]); p4dist[3] = np.sin(pnew[2])
            # Pnear_idx = nbrs.radius_neighbors(pnew.reshape(1, -1)[:, :2], r, return_distance=False)
            Pnear_idx = nbrs.radius_neighbors(p4dist.reshape(1, -1), r, return_distance=False)
            Pnear_idx = Pnear_idx[0]
            cached_nbrs["prand"] = prand
            cached_nbrs["pnearest_idx"] = pnearest_idx
            cached_nbrs["pnew"] = pnew
            cached_nbrs["path_new"] = path_new
            cached_nbrs["Pnear_idx"] = Pnear_idx
            cached_nbrs["Pnear_forward"] = []
            cached_nbrs["Pnear_backward"] = []
            cached_nbrs["pnear_pnew"] = []
            cached_nbrs["pnew_pnear"] = []
            for p_idx in Pnear_idx:
                p = V_xy[p_idx]
                p_xyz = V[p_idx]
                # forward: from the near vertex to the new point
                path, reached = posq.simulate(p_xyz, pnew, steps=int(stp))
                if reached == True:
                    safe = self.path_safe(path)
                    if safe is True:
                        path_info = {"path": path, "reached": reached, "safe": safe}
                        cached_nbrs["pnear_pnew"].append(path_info)
                        cached_nbrs["Pnear_forward"].append(p_idx)
                # backward: from the new point to the near vertex
                path, reached = posq.simulate(pnew, p_xyz, steps=int(stp))
                if reached == True:
                    safe = self.path_safe(path)
                    if safe == True:
                        path_info = {"path": path, "reached": reached, "safe": safe}
                        cached_nbrs["pnew_pnear"].append(path_info)
                        cached_nbrs["Pnear_backward"].append(p_idx)
            V.append(pnew)
            V_xy.append(p4dist)
            nbrs.fit(V_xy)
            # mark = self.make_sample_marker(pnew)
            # marker_points.markers.append(mark)
            sampled_points.append(cached_nbrs)
        rrt_iter += 1
        if len(V) == points_to_cache - 50:
            bias = 0.9
        if len(V) == points_to_cache:
            planning_done = True
    print("Number of cached points:", len(V))
    print("time taken", time.time() - t1)
    return sampled_points
for i in range(nb_items):
    items[i, 0] = np.random.randint(0, 100)
    items[i, 1] = np.random.randint(0, 100)
    items[i, 2] = np.random.randint(0, 100)
    items[i, 3] = np.random.randint(0, 100)

metrics = ['euclidean', 'hamming', 'jaccard']
for metric in metrics:
    print('Metric: %r' % metric)

    # Fit k-nearest neighbors
    nn = NearestNeighbors(n_neighbors=10, radius=5.0, metric=metric)
    nn.fit(items)

    # Create a test product
    test_product = np.array([15, 60, 28, 73])

    # Determine the neighbors with different radiuses
    d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=20)
    print('Suggestions (radius=20):')
    print(suggestions)

    d, suggestions = nn.radius_neighbors(test_product.reshape(1, -1), radius=30)
    print('Suggestions (radius=30):')
    print(suggestions)
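# Follow-up note on the snippet above: radius_neighbors returns one
# variable-length index array per query row (dtype=object) and does not sort
# the neighbors by distance, so rank suggestions per row explicitly.
import numpy as np
from sklearn.neighbors import NearestNeighbors

items = np.random.RandomState(1).randint(0, 100, size=(50, 4))
nn = NearestNeighbors(metric='euclidean').fit(items)
d, suggestions = nn.radius_neighbors([[15, 60, 28, 73]], radius=30)
order = np.argsort(d[0])  # closest suggestions first
print('suggestions sorted by distance:', suggestions[0][order])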
plt.imsave('../images/baboon_mean_shift.png', babbon_img)
fig = plt.figure(figsize=(30, 30))
fig.add_subplot(1, 3, 1)
plt.imshow(smooth_baboon[::2, ::2])
plt.title("Smooth Baboon Image")
plt.colorbar()
fig.add_subplot(1, 3, 2)
plt.imshow(babbon_img)
plt.title("Mean Shift Image without clustering")
plt.colorbar()

# After computing the segmented image, group pixels into clusters with a
# radius query across their intensity values.
nbrs = NearestNeighbors(radius=0.007, algorithm='auto').fit(data_c[:, :3])
dis, ind = nbrs.radius_neighbors(data_c[:, :3])
new_data = np.copy(data_c)
for i in range(len(ind)):
    nn = data_c[list(ind[i])]
    nn[:, :3] = new_data[i, :3]  # give the neighbours the same pixel values
    new_data[list(ind[i]), :3] = nn[:, :3]
seg_img = np.zeros_like(baboon_array[::2, ::2])
for d in new_data:
    seg_img[int(d[3]), int(d[4])] = d[:3]
plt.imsave('../images/baboon_clustered.png', seg_img)
fig.add_subplot(1, 3, 3)
class NearestNeighborsRatioEstimator(object):
    """
    Nearest neighbor ratio estimator
    """

    def __init__(self, n_neighbors=2):
        """
        Instantiates the learner.

        n_neighbors: number of neighbors for the KNN estimator
        """
        self.n_neighbors = n_neighbors

    def get_params(self, deep=True):
        """
        Get parameters (for scikit-learn)
        """
        return {"n_neighbors": self.n_neighbors}

    def set_params(self, **parameters):
        """
        Set parameters (for scikit-learn)
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)

    def fit(self, X_tr, X_te):
        """
        Fit the model

        X_tr: training sample
        X_te: test sample
        """
        self.n_tr = X_tr.shape[0]
        self.n_te = X_te.shape[0]

        # build kd-trees for both domains
        self.nbrs_tr = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm='kd_tree').fit(X_tr)
        self.nbrs_te = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm='kd_tree').fit(X_te)

    def fit_cv(self, X_tr, X_te, K_list, n_cv=5, n_jobs=0, shuffle=False, random_state=42):
        """
        Fit the model using cross-validation to find an optimal number of K neighbors

        X_tr: training sample
        X_te: test sample
        K_list: list of K values to consider for cross-validation
        n_cv: number of folds for cross-validation
        n_jobs: number of jobs to use when running cross-validation in parallel.
            If 0 or 1, run jobs sequentially. If -1, take len(K_list) jobs.
        """
        if n_jobs == -1:
            n_jobs = len(K_list)

        kf_tr = KFold(n_splits=n_cv, shuffle=shuffle, random_state=random_state).split(X_tr)
        kf_te = KFold(n_splits=n_cv, shuffle=shuffle, random_state=random_state).split(X_te)
        kf_tr, kf_te = list(kf_tr), list(kf_te)

        if n_jobs == 0 or n_jobs == 1:
            self.losses = []
            for K in K_list:
                self.losses.append(cv_loss(X_tr, X_te, kf_tr, kf_te, K, n_cv))
        else:
            self.losses = Parallel(n_jobs=n_jobs)(
                delayed(cv_loss)(X_tr, X_te, kf_tr, kf_te, K, n_cv) for K in K_list)

        self.n_neighbors = K_list[np.argmin(self.losses)]
        print(f"Optimal K neighbors: {self.n_neighbors}")
        self.fit(X_tr, X_te)

    def compute_weights(self, X_ev):
        """
        Predicts weights for a set of (evaluation) patterns.

        X_ev: sample of data points at which we evaluate the density ratio weights

        returns: weights for each data point in the given sample
        """
        # get K nearest neighbors (and radii) from the training domain
        distances, ind = self.nbrs_tr.kneighbors(X_ev)
        radii = distances[:, -1]

        # compute weights
        weights = np.zeros(X_ev.shape[0])
        for i in range(X_ev.shape[0]):
            # count the test (numerator) samples within the current radius of the query sample
            weights[i] = len(
                self.nbrs_te.radius_neighbors(X_ev[i, :].reshape(1, -1),
                                              radius=radii[i],
                                              return_distance=False)[0])

        # divide by the K training (denominator) neighbors and normalize by the
        # ratio of training to test sample sizes
        weights *= float(self.n_tr) / float(self.n_neighbors * self.n_te)

        return weights
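# Usage sketch for NearestNeighborsRatioEstimator above on a toy covariate
# shift: training and test samples are Gaussians with different means, so the
# estimated weights should be larger where the test density exceeds the
# training density.
import numpy as np

rng = np.random.RandomState(0)
X_tr = rng.normal(0.0, 1.0, size=(500, 2))
X_te = rng.normal(1.0, 1.0, size=(500, 2))
est = NearestNeighborsRatioEstimator(n_neighbors=10)
est.fit(X_tr, X_te)
w = est.compute_weights(X_tr)
print('mean weight near the training mean:', w[np.linalg.norm(X_tr, axis=1) < 1].mean())
print('mean weight near the test mean:   ', w[np.linalg.norm(X_tr - 1.0, axis=1) < 1].mean())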
class Search:
    """
    This class is used to apply neighbourhood search algorithms on point clouds.
    K-nearest neighbours and radius search are implemented.
    """

    def __init__(self, inputCloud):
        """
        Inputs:
            inputCloud  np.array of row vectors, shape (<numberPoints>, 3)
        """
        self.inputCloud = inputCloud
        self.initializer = 0

    def nearestKSearch(self, point, k):
        """
        Search for the k nearest neighbors of the given query point.

        Inputs:
            point   query point as vector with dimension (1,3) or (3,1)
            k       number of nearest neighbors to include in the output set

        Outputs:
            outputPointSet  indices (into inputCloud) of the len points
                            closest to the query point, with 0 <= len <= k.

        Based on the method in
        http://docs.pointclouds.org/1.9.1/classpcl_1_1search_1_1_search.html and
        https://scikit-learn.org/stable/modules/neighbors.html
        """
        # column to row vector, as required by kneighbors
        if point.shape == (3, 1) or point.shape == (3,):
            point = np.reshape(point, (1, 3))
        # avoid an error when fewer points than k are given
        if self.inputCloud.shape[0] < k:
            k = self.inputCloud.shape[0]
        if self.initializer == 0:
            # initialisation of the nearest neighbour class
            self.neigh = NearestNeighbors(n_neighbors=k)
            self.neigh.fit(self.inputCloud)
            self.initializer = 1
        # calculate the k nearest neighbours for the given point; passing
        # n_neighbors explicitly keeps the cached estimator reusable
        result = self.neigh.kneighbors(point, n_neighbors=k)
        outputPointSet = result[1][0]
        # note: result[0][0] holds the distance for each pair
        return outputPointSet

    def radiusSearch(self, point, radius, max_nn=float("inf")):
        """
        Search for all nearest neighbors of the query point within a given radius.

        Inputs:
            point   query point as vector with dimension (1,3) or (3,1)
            radius  radius of the sphere that determines the included points
            max_nn  maximum number of included points. Default: infinite

        Outputs:
            outputPointSet  indices (into inputCloud) of the points inside the
                            sphere of the given radius around the query point;
                            points on the sphere are included.

        Based on the method in
        http://docs.pointclouds.org/1.9.1/classpcl_1_1search_1_1_search.html and
        https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.RadiusNeighborsClassifier.html
        """
        # column to row vector, as required by radius_neighbors
        if point.shape == (3, 1) or point.shape == (3,):
            point = np.reshape(point, (1, 3))
        if self.initializer == 0:
            # initialisation of the nearest neighbour class
            self.neigh = NearestNeighbors(radius=radius)
            self.neigh.fit(self.inputCloud)
            self.initializer = 1
        # calculate the nearest neighbours; the radius is passed explicitly so a
        # cached estimator (possibly built by nearestKSearch) gives correct
        # results. Uses the Minkowski distance with p=2, which equals the
        # Euclidean distance.
        dists, idx = self.neigh.radius_neighbors(point, radius=radius)
        outputPointSet = idx[0]
        if len(outputPointSet) > max_nn:
            # radius_neighbors does not sort, so keep the max_nn closest points
            order = np.argsort(dists[0])
            outputPointSet = outputPointSet[order][:int(max_nn)]
        return outputPointSet
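# Usage sketch for the Search class above on a tiny synthetic cloud; both
# methods return indices into inputCloud.
import numpy as np

cloud = np.array([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.], [10., 10., 10.]])
search = Search(cloud)
print('3-NN of the origin:      ', search.nearestKSearch(np.zeros(3), 3))
print('neighbors within r = 2.0:', search.radiusSearch(np.zeros(3), 2.0))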
def path_freq(arr, voxdim, freq_threshold):

    vox = np.array(arr / voxdim).astype(int)
    voxels = defaultdict(list)
    for i, v in enumerate(vox):
        voxels[tuple(v)].append(i)

    vox_uniques = remove_duplicates(vox)
    vox_uniques = vox_uniques * voxdim
    # vox_uids = np.arange(vox_uniques.shape[0])

    center = base_center(arr, base_length=0.3)[0]

    G = create_graph_iter(vox_uniques, n_neighbors=5, nn_step=2,
                          dist_threshold=np.inf, maxiter=20)
    print('Graph created')

    nbrs = NearestNeighbors(leaf_size=15, n_jobs=-1).fit(vox_uniques)
    base_id = nbrs.kneighbors(center.reshape(1, -1), 1,
                              return_distance=False)[0][0]

    mask = np.zeros(G.number_of_nodes())

    # Calculating the shortest path
    shortpath = nx.single_source_dijkstra_path_length(G, base_id)

    # Obtaining the node coordinates and their respective distance from
    # the base point (materialize the dict views so they can index arrays).
    nodes_ids = list(shortpath.keys())
    dist = np.array(list(shortpath.values()))

    # Obtaining the path list for every node.
    path = nx.single_source_dijkstra_path(G, base_id)

    # Obtaining node coordinates.
    nodes = vox_uniques[nodes_ids]
    path_nodes = [i for j in path.values() for i in j]

    # Obtaining all unique values in the central nodes path and their
    # respective frequency.
    path_nodes, freq = np.unique(path_nodes, return_counts=True)

    # Log-transforming the frequency values.
    freq_log = np.log(freq)

    # Filtering the central nodes based on the frequency of paths
    # that contain each node.
    freq_mask = (freq_log >= (np.max(freq_log) * freq_threshold)).astype(bool)
    p = nodes[freq_mask]
    pdist = dist[freq_mask]

    nbrs = NearestNeighbors(leaf_size=15, n_jobs=-1).fit(nodes)
    nbrs_ids = nbrs.radius_neighbors(p, radius=voxdim * 3,
                                     return_distance=False)

    mask[freq_mask] = 1
    for p_id, idx in enumerate(nbrs_ids):
        for id_ in idx:
            if dist[id_] <= pdist[p_id]:
                mask[id_] = 1

    mask = mask.astype(bool)

    e = np.inf
    threshold = 1
    dist_threshold = voxdim * 4
    print('Starting region growing')
    while e > threshold:
        nbrs = NearestNeighbors(leaf_size=15, n_jobs=-1).fit(vox_uniques[~mask])
        e1 = np.sum(mask)
        nbrs_dist, nbrs_ids = nbrs.kneighbors(vox_uniques[mask], 1)
        for i, nbr_i in enumerate(nbrs_ids[nbrs_dist <= dist_threshold]):
            if dist[nbr_i] <= dist[mask][i]:
                mask[nbr_i] = True
        e2 = np.sum(mask)
        e = e2 - e1
        # e = nbrs_ids.shape[0]
        print(e)

    vids = []
    new_voxels = (vox_uniques[mask] / voxdim).astype(int)
    for i in new_voxels:
        vids.append(voxels[tuple(i)])

    voxels_ids = np.unique([i for j in vids for i in j])

    return arr[voxels_ids]
def rrtstar(self, sample_fn):
    """ RRT* Algorithm """
    vol_freecells = len(self._freecells) * self._navmap.info.resolution ** 2
    gamma_rrg = 2 * sqrt(1.5 * vol_freecells / pi)
    max_range = self.utility_function.getMaximumSensorRange()
    print('max range is', max_range)
    probot = np.array([self._robot_pose.pose.position.x, self._robot_pose.pose.position.y])
    # V is a list of vertices. E is a dictionary where each key is connected with its values.
    # parents is a dictionary mapping a node to its parent; since each node appears as a key
    # only once, it has a single parent.
    V = [probot]
    E = {}
    parents = {}
    W = [self.current_weights]
    w = ap_utility.VectorOfDoubles()
    w_post = ap_utility.VectorOfDoubles()
    w.extend(self.current_weights)
    Ent = [self.utility_function.computeExpEntropy(probot[0], probot[1], 0.0, w, w_post)]
    Dist = [0.0]
    # C stores the cost at vertex idx, which is the sum of the edges leading to it.
    C = [float('Inf')]
    nbrs = NearestNeighbors(n_neighbors=1)
    nbrs.fit(V)
    cmin = 0
    t1 = time.time()
    informative_point_found = False
    planning_done = False
    rrt_iter = 0
    while not planning_done:
        t2 = time.time()
        """ Sampling new point """
        prand = sample_fn()
        (dist, idx) = nbrs.kneighbors(np.asarray(prand).reshape(1, -1))
        pnearest_idx = idx.ravel()[0]
        pnearest = V[pnearest_idx]
        """ Turning new point into reachable point """
        if dist < self._rrt_eta:
            pnew = prand
        else:
            pnew = self.steer(pnearest, prand)
        """ Checking if segment is valid and updating graph """
        if self.segment_safe(V[pnearest_idx], pnew) is True:
            r = np.min([gamma_rrg * sqrt(log(len(V)) / float(len(V))), self._rrt_eta])
            Pnear_idx = nbrs.radius_neighbors(np.asarray(pnew).reshape(1, -1), r, return_distance=False)
            Pnear_idx = Pnear_idx[0]
            pmin_idx = pnearest_idx
            w = ap_utility.VectorOfDoubles()
            w_post = ap_utility.VectorOfDoubles()
            w.extend(W[pnearest_idx])
            (dist_nearest_particle, idx) = self._particle_nbrs.kneighbors(np.asarray(pnew).reshape(1, -1))
            if dist_nearest_particle < max_range:
                # if at least one particle is visible
                entropy = self.utility_function.computeExpEntropy(pnew[0], pnew[1], 0.0, w, w_post)
            if dist_nearest_particle >= max_range or entropy == 0:
                # utility function failed
                entropy = Ent[pmin_idx]
                w_post = w
            dist = np.linalg.norm(pnearest - pnew)
            cmin = (self._rrt_near_bias * dist_nearest_particle +
                    self._rrt_dist_bias * (Dist[pnearest_idx] + dist) +
                    self._rrt_entropy_bias * entropy)
            for p_idx in Pnear_idx:
                p = V[p_idx]
                w = ap_utility.VectorOfDoubles()
                w_near = ap_utility.VectorOfDoubles()
                w.extend(W[p_idx])
                if np.abs(Ent[p_idx] - entropy) < 1e-6:
                    # if there is anything to gain in terms of information
                    entropy_near = self.utility_function.computeExpEntropy(pnew[0], pnew[1], 0.0, w, w_near)
                else:
                    entropy_near = Ent[p_idx]
                (dist_nearest_particle, idx) = self._particle_nbrs.kneighbors(np.asarray(p).reshape(1, -1))
                c = (self._rrt_near_bias * dist_nearest_particle +
                     self._rrt_dist_bias * (Dist[p_idx] + np.linalg.norm(p - pnew)) +
                     self._rrt_entropy_bias * entropy_near)
                if (self.segment_safe(p, pnew) is True and c < cmin):
                    cmin = c
                    pmin_idx = p_idx
            if pmin_idx in E:
                E[pmin_idx].add(len(V))
            else:
                E[pmin_idx] = set([len(V)])
            pnew_idx = len(V)
            V.append(pnew)
            C.append(cmin)
            W.append(w_post)
            Ent.append(entropy)
            Dist.append(Dist[pmin_idx] + dist)
            parents[pnew_idx] = pmin_idx
            """ Re-wire the tree """
            for p_idx in Pnear_idx:
                if p_idx in parents:
                    p = V[p_idx]
                    w = ap_utility.VectorOfDoubles()
                    w_near = ap_utility.VectorOfDoubles()
                    w.extend(W[-1])  # pnew
                    if np.abs(Ent[p_idx] - entropy) <= 0:  # 1e-6
                        # if there is anything to gain in terms of information
                        entropy_near = self.utility_function.computeExpEntropy(pnew[0], pnew[1], 0.0, w, w_near)
                    else:
                        entropy_near = Ent[p_idx]
                        w_near = w
                    dist = np.linalg.norm(p - pnew)
                    (dist_nearest_particle, idx) = self._particle_nbrs.kneighbors(np.asarray(p).reshape(1, -1))
                    c = (self._rrt_near_bias * dist_nearest_particle +
                         self._rrt_dist_bias * (Dist[-1] + dist) +
                         self._rrt_entropy_bias * entropy_near)
                    if (self.segment_safe(p, pnew) is True and c < C[p_idx]):
                        E[parents[p_idx]].remove(p_idx)
                        parents[p_idx] = pnew_idx
                        print('rewired', p_idx, 'to', pnew_idx)
                        if pnew_idx in E:
                            E[pnew_idx].add(p_idx)
                        else:
                            E[pnew_idx] = set([p_idx])
                        C[p_idx] = c
                        W[p_idx] = w_near
                        Ent[p_idx] = entropy_near
                        Dist[p_idx] = Dist[-1] + dist
            nbrs.fit(V)
        # print('iteration done. time:', time.time() - t2)
        # print('min entropy:', np.min(I))
        if np.max(Ent) - np.min(Ent) >= 1e-6:  # just to compensate arithmetic noise
            informative_point_found = True
        """ Find best path: """
        path = self.get_best_path(parents, V, C)
        if informative_point_found and len(path.poses) > self._max_path_size:
            planning_done = True
        rrt_iter += 1
        if rrt_iter > self._max_rrt_iterations:
            planning_done = True
    if not informative_point_found:
        rospy.logwarn("Could not find an informative goal point in %d iterations! Aborting.",
                      self._max_rrt_iterations)
    print('total time: ', time.time() - t1)
    self.publish_rrt(V, E)
    self._path_pub.publish(path)
    self.publish_entropy_info(V, Ent)
    return path
def mean_shift(X, bandwidth, n_seeds, kernel_function='gaussian', max_iterations=100, proximity_thresh=5):
    '''
    ---Parameters---
    X : data in form (samples, dims)
    bandwidth : radius of nearest neighbors
    n_seeds : number of seed points randomly drawn from X
    kernel_function : can be "gaussian" or "flat" or your own kernel
    proximity_thresh : minimum distance (in pixels) a new cluster must be away from previous ones

    ---Returns---
    cluster_centers : center of each found cluster
    cluster_counts : how many pixels are within the neighborhood of each cluster
    '''
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    if kernel_function == 'gaussian':
        kernel_update_function = gaussian_kernel
    elif kernel_function == 'flat':
        kernel_update_function = flat_kernel
    else:
        kernel_update_function = kernel_function

    n_points, n_features = X.shape
    stop_thresh = 1e-2 * bandwidth  # when the mean has converged
    cluster_centers = []
    cluster_counts = []

    # ball_tree = BallTree(X)  # to efficiently look up nearby points
    neighbors = NearestNeighbors(radius=bandwidth).fit(X)
    seeds = X[np.random.randint(0, X.shape[0], n_seeds)]

    # For each seed, climb the gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            points_within = X[neighbors.radius_neighbors([weighted_mean], bandwidth,
                                                         return_distance=False)[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
            converged = np.linalg.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                # Only add the cluster if it is different enough from the other centers
                if len(cluster_centers) > 0:
                    diff_from_prev = [np.linalg.norm(weighted_mean - cluster_centers[i], 2)
                                      for i in range(len(cluster_centers))]
                    if np.min(diff_from_prev) > proximity_thresh:
                        cluster_centers.append(weighted_mean)
                        cluster_counts.append(points_within.shape[0])
                else:
                    cluster_centers.append(weighted_mean)
                    cluster_counts.append(points_within.shape[0])
                break
            completed_iterations += 1

    return cluster_centers, cluster_counts
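# gaussian_kernel / flat_kernel are referenced above but not defined in this
# snippet; a minimal pair consistent with the call signature
# (old_mean, points, bandwidth) could look like this (an assumption, not the
# original implementations):
import numpy as np

def flat_kernel(old_mean, points, bandwidth):
    # unweighted mean of the points already inside the bandwidth ball
    return points.mean(axis=0)

def gaussian_kernel(old_mean, points, bandwidth):
    # Gaussian-weighted mean centred on the current estimate
    d2 = np.sum((points - old_mean) ** 2, axis=1)
    w = np.exp(-d2 / (2.0 * bandwidth ** 2))
    return np.sum(points * w[:, None], axis=0) / np.sum(w)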
def __init__(self, nNegSample=1000, nNeg=10, margin=0.1, input_transform=None):
    super().__init__()

    self.img_list = join(root, yeouido_img_list_txt)
    self.images = [e.strip() for e in open(self.img_list)]
    self.input_transform = input_transform() if input_transform is not None else None
    self.nNegSample = nNegSample  # number of negatives to randomly sample
    self.nNeg = nNeg  # number of negatives used for training
    self.margin = margin

    self.position = np.load(yeouido_position_npy)
    self.positive_thres = 5
    self.negative_thres = 20

    All_idx = np.arange(0, len(self.images), step=1)
    self.Qidx = np.arange(0, len(self.images), step=4)
    self.DBidx = np.setdiff1d(All_idx, self.Qidx)
    # self.DBidx = np.arange(int(len(self.images)/2)) * 2
    # self.Qidx = self.DBidx + 1
    np.random.shuffle(self.DBidx)
    np.random.shuffle(self.Qidx)

    # potential positives are those within a nontrivial threshold range;
    # fit a NN index to find them and search by radius
    # knn = kNN_GPU(d=len(get_multiple_elements(self.position,self.DBidx)[0]), GPU=True, GPU_Number=torch.cuda.current_device())
    # knn.train(np.array(get_multiple_elements(self.position,self.DBidx)).astype("float32"))
    knn_cpu = NearestNeighbors(n_jobs=-1, metric='euclidean')
    knn_cpu.fit(get_multiple_elements(self.position, self.DBidx))

    self.potential_positives = knn_cpu.radius_neighbors(
        get_multiple_elements(self.position, self.Qidx),
        radius=self.positive_thres,
        return_distance=False)
    # self.potential_positives = knn.predict(np.asarray(get_multiple_elements(self.position,self.Qidx)).astype("float32"), 10)

    # sort indices of potential positives
    for i, positive_indices in enumerate(self.potential_positives):
        self.potential_positives[i] = np.sort(positive_indices)

    # it's possible some queries don't have any non-trivial potential positives
    self.queries = np.where(np.array([len(x) for x in self.potential_positives]) > 0)[0]

    # potential negatives are those outside of the negative_thres range
    potential_unnegatives = knn_cpu.radius_neighbors(
        get_multiple_elements(self.position, self.Qidx),
        radius=self.negative_thres,
        return_distance=False)
    # potential_unnegatives = knn.predict(np.asarray(get_multiple_elements(self.position,self.Qidx)).astype("float32"), 20)

    # potential negatives: indices into DBidx farther away than negative_thres metres
    self.potential_negatives = []
    for pos in potential_unnegatives:
        self.potential_negatives.append(
            np.setdiff1d(np.arange(self.DBidx.shape[0]), pos, assume_unique=True))

    self.cache = None  # filepath of HDF5 containing feature vectors for images
    self.negCache = [np.empty((0,)) for _ in range(self.Qidx.shape[0])]
def dbscan(self, X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=None): """Perform DBSCAN clustering from vector array or distance matrix. Read more in the :ref:`User Guide <dbscan>`. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. eps : float, optional The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is not a maximum bound on the distances of points within a cluster. This is the most important DBSCAN parameter to choose appropriately for your data set and distance function. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. X may be a sparse matrix, in which case only "nonzero" elements may be considered neighbors for DBSCAN. metric_params : dict, optional Additional keyword arguments for the metric function. .. versionadded:: 0.19 algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. leaf_size : int, optional (default = 30) Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. p : float, optional The power of the Minkowski metric to be used to calculate distance between points. sample_weight : array, shape (n_samples,), optional Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details. Returns ------- core_samples : array [n_core_samples] Indices of core samples. labels : array [n_samples] Cluster labels for each point. """ if not eps > 0.0: raise ValueError("eps must be positive.") X = check_array(X, accept_sparse='csr') if sample_weight is not None: sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) # Calculate neighborhood for all samples. This leaves the original point # in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. 
While True, its useless information) if metric == 'precomputed' and sparse.issparse(X): neighborhoods = np.empty(X.shape[0], dtype=object) X.sum_duplicates() # XXX: modifies X's internals in-place # set the diagonal to explicit values, as a point is its own neighbor with warnings.catch_warnings(): warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning) X.setdiag(X.diagonal()) # XXX: modifies X's internals in-place X_mask = X.data <= eps masked_indices = X.indices.astype(np.intp, copy=False)[X_mask] masked_indptr = np.concatenate(([0], np.cumsum(X_mask))) masked_indptr = masked_indptr[X.indptr[1:-1]] # split into rows neighborhoods[:] = np.split(masked_indices, masked_indptr) else: neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, p=p, n_jobs=n_jobs) neighbors_model.fit(X) # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors( X, eps, return_distance=False) if sample_weight is None: n_neighbors = np.array( [len(neighbors) for neighbors in neighborhoods]) else: n_neighbors = np.array([ np.sum(sample_weight[neighbors]) for neighbors in neighborhoods ]) # Initially, all samples are noise. labels = np.full(X.shape[0], -1, dtype=np.intp) # A list of all core samples found. core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8) dbscan_inner(core_samples, neighborhoods, labels) return np.where(core_samples)[0], labels
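# Usage sketch for the dbscan method above. The method does not touch self, so
# any instance of its enclosing class works; `Clusterer` below is a hypothetical
# stand-in for that (not shown) class.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, cluster_std=0.4, random_state=0)
clusterer = Clusterer()  # hypothetical enclosing class
core_idx, labels = clusterer.dbscan(X, eps=0.5, min_samples=5)
print('clusters found:', len(set(labels)) - (1 if -1 in labels else 0))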
class GlobalKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Global K-means Algorithm

    Parameters:

    n_clusters : int
        maximum number of clusters to obtain
    algorithm : string
        'classical' - the classical algorithm
        'bagirov'   - the Bagirov 2006 variant
    radius : float
        maximum distance to the nearest center for predict(); points farther
        away are labeled -1
    """

    def __init__(self, n_clusters, algorithm='classical', radius=np.inf):
        self.n_clusters = n_clusters
        self.cluster_centers_ = None
        self.labels_ = None
        self.cluster_sizes_ = None
        self.inertia_ = None
        self.algorithm = algorithm
        self.radius = radius

    def fit(self, X):
        """
        Clusters the examples

        :param X:
        :return:
        """
        if self.algorithm == 'classical':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process(X)
        elif self.algorithm == 'bagirov':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process_bagirov(X)
        return self

    def predict(self, X):
        """
        Returns the nearest cluster for a data matrix

        @param X:
        @return:
        """
        clasif = []
        for i in range(X.shape[0]):
            ncl, mdist = self._find_nearest_cluster(X[i].reshape(1, -1), self.cluster_centers_)
            if mdist <= self.radius:
                clasif.append(ncl)
            else:
                clasif.append(-1)
        return clasif

    def _fit_process(self, X):
        """
        Classical global k-means algorithm

        :param X:
        :return:
        """
        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        centroids.shape = (1, X.shape[1])

        for i in range(2, self.n_clusters + 1):
            mininertia = np.inf
            for j in range(X.shape[0]):
                newcentroids = np.vstack((centroids, X[j]))
                # print(newcentroids.shape)
                km = KMeans(n_clusters=i, init=newcentroids, n_init=1)
                km.fit(X)
                if mininertia > km.inertia_:
                    mininertia = km.inertia_
                    bestkm = km
            centroids = bestkm.cluster_centers_

        return bestkm.cluster_centers_, bestkm.labels_, bestkm.inertia_

    def _fit_process_bagirov(self, X):
        """
        Clusters using the global K-means algorithm, Bagirov variation

        :param X:
        :return:
        """
        # Create a KNN structure for fast search
        self._neighbors = NearestNeighbors()
        self._neighbors.fit(X)

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        assignments = [0 for i in range(X.shape[0])]
        centroids.shape = (1, X.shape[1])

        # compute the distance of the examples to the centroids
        mindist = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            mindist[i] = euclidean_distances(X[i].reshape(1, -1),
                                             centroids[assignments[i]].reshape(1, -1),
                                             squared=True)[0]

        for k in range(2, self.n_clusters + 1):
            newCentroid = self._compute_next_centroid(X, centroids, assignments, mindist)
            centroids = np.vstack((centroids, newCentroid))
            km = KMeans(n_clusters=k, init=centroids, n_init=1)
            km.fit(X)
            assignments = km.labels_
            for i in range(X.shape[0]):
                mindist[i] = euclidean_distances(X[i].reshape(1, -1),
                                                 centroids[assignments[i]].reshape(1, -1),
                                                 squared=True)[0]

        return km.cluster_centers_, km.labels_, km.inertia_

    def _compute_next_centroid(self, X, centroids, assignments, mindist):
        """
        Computes the candidate for the next centroid

        :param X:
        :param centroids:
        :return:
        """
        minsum = np.inf
        candCentroid = None

        # Compute the first candidate to new centroid
        for i in range(X.shape[0]):
            distance = euclidean_distances(X[i].reshape(1, -1),
                                           centroids[assignments[i]].reshape(1, -1))[0]
            S2 = self._neighbors.radius_neighbors(X[i].reshape(1, -1),
                                                  radius=distance,
                                                  return_distance=False)[0]
            S2centroid = np.sum(X[S2], axis=0) / len(S2)
            S2centroid.shape = (1, X.shape[1])
            cost = self._compute_fk(X, mindist, S2centroid)

            if cost < minsum:
                minsum = cost
                candCentroid = S2centroid

        # Compute examples for the new centroid
        S2 = []
        newDist = euclidean_distances(X, candCentroid.reshape(1, -1), squared=True)
        for i in range(X.shape[0]):
            if newDist[i] < mindist[i]:
                S2.append(i)
newCentroid = sum(X[S2]) / len(S2) newCentroid.shape = (1, X.shape[1]) while not (candCentroid == newCentroid).all(): candCentroid = newCentroid S2 = [] newDist = euclidean_distances(X, candCentroid.reshape(1, -1), squared=True) for i in range(X.shape[0]): if newDist[i] < mindist[i]: S2.append(i) newCentroid = np.sum(X[S2], axis=0) / len(S2) newCentroid.shape = (1, X.shape[1]) return candCentroid def _compute_fk(self, X, mindist, ccentroid): """ Computes the cost function :param X: :param mindist: :param ccentroid: :return: """ # Distances among the examples and the candidate centroid centdist = euclidean_distances(X, ccentroid.reshape(1, -1), squared=True) fk = 0 for i in range(X.shape[0]): fk = fk + min(mindist[i], centdist[i][0]) return fk @staticmethod def _find_nearest_cluster(examp, centers): """ Finds the nearest cluster for an example :param examp: :param centers: :return: """ dist = euclidean_distances(centers, examp.reshape(1, -1)) pmin = np.argmin(dist) vmin = np.min(dist) return pmin, vmin
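# Usage sketch for GlobalKMeans above on synthetic blobs; 'bagirov' exercises
# the radius_neighbors-based candidate search, and predict() labels points
# farther than `radius` from every center as -1.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=150, centers=4, cluster_std=0.5, random_state=0)
gkm = GlobalKMeans(n_clusters=4, algorithm='bagirov').fit(X)
print('inertia:', gkm.inertia_)
print('labels of the first five points:', gkm.predict(X[:5]))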
def test_radius_neighbors():
    # Checks whether the returned distances are less than `radius`.
    # At least one point should be returned when `radius` is set to the mean
    # distance from the query point to the other points in the database.
    # Moreover, this test compares the radius neighbors of LSHForest with
    # those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths,
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
# (the start of this snippet was truncated; the constructor may have taken
#  further arguments before the two that survive below)
NearestNeighborsModel = NearestNeighbors(algorithm='auto', n_jobs=-1)
NearestNeighborsModel.fit(X_train)

# ----------------------------------------------------
# Calculating details
print('The distance metric to use is : ', NearestNeighborsModel.effective_metric_)
print('Additional keyword arguments for the metric function are : ',
      NearestNeighborsModel.effective_metric_params_)
print('Number of samples in the fitted data is : ', NearestNeighborsModel.n_samples_fit_)
print("=" * 10)
print('NearestNeighborsModel Train kneighbors are : ', NearestNeighborsModel.kneighbors(X_train[:1]))
print('NearestNeighborsModel Train radius neighbors are : ',
      NearestNeighborsModel.radius_neighbors(X_train[:1]))
print("=" * 10)
print('NearestNeighborsModel Test kneighbors are : ', NearestNeighborsModel.kneighbors(X_test[:1]))
print('NearestNeighborsModel Test radius neighbors are : ',
      NearestNeighborsModel.radius_neighbors(X_test[:1]))
print("=" * 25)

# ----------------------------------------------------
plt.figure("data")
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=1, palette=['r', 'b', 'k'])
sns.scatterplot(x=centers[:, 0], y=centers[:, 1], s=75, color="yellow", label="Centers")
plt.show(block=False)
id,x1,y1,m,x2,y2 = np.genfromtxt('%s.dat'%(i+1),unpack=True,usecols=(0,3,4,5,9,10))
mask = (m<15)*(m>11)
id,x1,y1,m,x2,y2 = np.transpose([id,x1,y1,m,x2,y2])[mask].T
epinbu = np.in1d(id,bid)
buinep = np.in1d(bid,id)
print (id[epinbu]==bid[buinep]).sum(),id.size
epxy = np.transpose([x2,y2])[epinbu]
epm = m[epinbu]
nbrs = NN(n_neighbors=vecinos, algorithm='auto').fit(epxy)
if radio:
    dist, idx = nbrs.radius_neighbors(np.transpose([x2,y2]),radius=400)
    nbors = np.array([len(d) for d in dist])
    mednbors, minnbors = np.median(nbors),nbors.min()
    for j in range(len(dist)):
        # Remove the star itself
        msk = dist[j]>300
        dist[j] = dist[j][msk]
        idx[j] = idx[j][msk]
        # Keep only the brightest stars
        '''
        if len(dist[j])>15:
            midx = np.argsort(epm[idx[j]])[:15]
def pms(substack, args): """Find cells using mean shift. In this version, the substack is split into eight patches. Parameters ---------- substack : object :class:`bcfind.volume.SubStack` object representing the substack to be analyzed. args : object :py:class:`argparse.Namespace` object containing the arguments passed to the find_cells script, in particular - args.outdir: directory where results are saved - args.hi_local_max_radius: radius of the sphere used to decide whether a local maximum should be a seed - args.mean_shift_bandwidth: bandwidth for the mean shift algorithm - args.floating_point: bool, whether cell coordinates should be rounded before saving """ D = substack.info['Depth'] W = substack.info['Width'] H = substack.info['Height'] M = 20 patch = np.zeros((W,H,D)) for z in range(D): patch[:,:,z] = np.array(substack.imgs[z]).T slicesx = [slice(0, W/2+M), slice(W/2-M,W)] slicesy = [slice(0, H/2+M), slice(H/2-M,H)] slicesz = [slice(0, D/2+M), slice(D/2-M,D)] cluster_centers = np.zeros((0,3)) cluster_masses = np.zeros(0) L = np.zeros((0,3)) labels = np.zeros(0) seeds = [] counter = 0 for sx in slicesx: for sy in slicesy: for sz in slicesz: counter += 1 tee.log('%d/8:' % counter, 'Analyzing minisubstack',sx,sy,sz) rval = _patch_ms(patch[sx,sy,sz], args) origin = [sx.start,sy.start,sz.start] if rval is not None: cluster_centers = np.concatenate((cluster_centers, rval.cluster_centers + origin)) cluster_masses = np.concatenate((cluster_masses, rval.masses)) labels = np.concatenate((labels, rval.labels+len(rval.cluster_centers))) L = np.concatenate((L,rval.L+origin)) for c in rval.seeds: c.x += origin[0] c.y += origin[1] c.z += origin[2] seeds.extend(rval.seeds) if len(cluster_centers) > 0: # remove near duplicate points (because of overlapping margins) indices = np.argsort(cluster_masses) sorted_centers = cluster_centers[indices] sorted_masses = cluster_masses[indices] # sorted_volumes = volumes[indices] unique = np.ones(len(sorted_centers), dtype=np.bool) # FIXME - make it a parameter nbrs = NearestNeighbors(radius=5.5).fit(sorted_centers) for i, center in enumerate(sorted_centers): if unique[i]: neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0] unique[neighbor_idxs] = 0 unique[i] = 1 # leave the current point as unique cluster_centers = sorted_centers[unique] masses = sorted_masses[unique] masses_mean = np.mean(masses) masses_std = np.std(masses) # volumes = sorted_volumes[unique] C = [] for i, cc in enumerate(cluster_centers): c = volume.Center(cc[0], cc[1], cc[2]) c.name = 'MS_center %d' % i c.volume = (masses[i]-masses_mean)/masses_std # volumes[i] c.mass = masses[i] tee.log(i, cc, c) C.append(c) filename = args.outdir+'/ms.marker' substack.save_markers(filename, C, floating_point=args.floating_point) tee.log('Markers saved to', filename) filename = args.outdir+'/seeds.marker' substack.save_markers(filename, seeds) tee.log(len(seeds), 'seeds saved to', filename) up_outdir=dirname(abspath(args.outdir)) if args.save_image: image_saver = volume.ImageSaver(up_outdir, substack, C) Lx = [int(x) for x in L[:,0]] Ly = [int(y) for y in L[:,1]] Lz = [int(z) for z in L[:,2]] image_saver.save_above_threshold(Lx, Ly, Lz) tee.log('Debugging images saved in',up_outdir) else: tee.log('Debugging images not saved')
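# Stand-alone sketch of the near-duplicate removal idiom used above: centers
# are visited in order of decreasing mass, and every center within the query
# radius of a kept center is suppressed. The points, masses and radius are
# illustrative toy values, not bcfind data.
import numpy as np
from sklearn.neighbors import NearestNeighbors

centers = np.array([[0., 0., 0.], [0.2, 0., 0.], [10., 10., 10.]])
masses = np.array([5., 3., 7.])
order = np.argsort(masses)[::-1]          # heaviest first
sorted_centers = centers[order]
unique = np.ones(len(sorted_centers), dtype=bool)
nbrs = NearestNeighbors(radius=0.5).fit(sorted_centers)
for i, center in enumerate(sorted_centers):
    if unique[i]:
        neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0]
        unique[neighbor_idxs] = False     # suppress every neighbor...
        unique[i] = True                  # ...but keep the current center
print(sorted_centers[unique])             # [[10. 10. 10.], [0. 0. 0.]]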
def plan_cached_rrt(self, cached_points):
    """ Plan cached RRT """
    print "Planning a cached RRT*"
    marker_points = MarkerArray()
    vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
    print "FREE CELL VOLUME", vol_freecells
    gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)
    probot = np.array([self._robot_pose.pose.position.x,
                       self._robot_pose.pose.position.y,
                       2*np.arccos(self._robot_pose.pose.orientation.w)])
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree", leaf_size=30)
    # V is a list of vertices. E is a dictionary where each key is connected
    # with its values. parents is a dictionary mapping each node to its
    # parent; since each node is a key, it has exactly one parent.
    V = [probot]
    # V_xy stores each pose as (x, y, cos(theta), sin(theta)) for distance queries
    p4dist = np.zeros(4)
    p4dist[:2] = probot[:2]; p4dist[2] = np.cos(probot[2]); p4dist[3] = np.sin(probot[2])
    V_xy = [p4dist]
    E = {}
    parents = {}
    Dist = [0.0]
    # C stores the cost at vertex idx, which is the sum of the edges going to it.
    goal_xy = np.array([self._goal.pose.position.x,
                        self._goal.pose.position.y,
                        2*np.arccos(self._goal.pose.orientation.w)])
    c_init = self.cost_manager.get_cost(probot[:2], goal_xy[:2])
    edge_C = {}
    lowest_cost_idx = None
    t1 = time.time()
    planning_done = False
    rrt_iter = 0
    while not planning_done:
        t2 = time.time()
        """ Sampling new point """
        cached = cached_points[rrt_iter]
        prand = cached["prand"]
        pnearest_idx = cached["pnearest_idx"]
        pnearest = V[pnearest_idx]
        """ Turning new point into reachable point """
        pnew = cached["pnew"]
        path_new = cached["path_new"]
        Pnear_idx = cached["Pnear_idx"]
        pmin_idx = pnearest_idx
        cum_c = self.integrate_costs(edge_C, parents, pnearest_idx)
        min_edge_c = self.cost_manager.path_cost(path_new, goal_xy[:2])
        cmin = cum_c + min_edge_c
        cumulative_costs = {}
        Pnear_fwd = cached["Pnear_forward"]
        for num, p_idx in enumerate(Pnear_fwd):
            p = V_xy[p_idx]
            p_xyz = V[p_idx]
            cum_cost = self.integrate_costs(edge_C, parents, p_idx)
            cumulative_costs[p_idx] = cum_cost
            p_idx_path = cached["pnear_pnew"][num]["path"]
            reached = cached["pnear_pnew"][num]["reached"]
            safe = cached["pnear_pnew"][num]["safe"]
            if reached is True and safe is True:
                path_c = self.cost_manager.path_cost(p_idx_path, goal_xy[:2])
            else:
                path_c = 0
            c = cum_cost + path_c
            if (safe is True and reached is True and c < cmin):
                cmin = c
                min_edge_c = path_c
                pmin_idx = p_idx
        if E.has_key(pmin_idx):
            E[pmin_idx].add(len(V))
        else:
            E[pmin_idx] = set([len(V)])
        edge_C[pmin_idx, len(V)] = min_edge_c
        cumulative_last = cmin
        pnew_idx = len(V)
        V.append(pnew)
        p4dist = np.zeros(4)
        p4dist[:2] = pnew[:2]; p4dist[2] = np.cos(pnew[2]); p4dist[3] = np.sin(pnew[2])
        V_xy.append(p4dist)
        parents[pnew_idx] = pmin_idx
        """ Re-wire the tree """
        unsafe = 0
        Pnear_bwd = cached["Pnear_backward"]
        for en, p_idx in enumerate(Pnear_bwd):
            if parents.has_key(p_idx):
                if not cumulative_costs.has_key(p_idx):
                    cumulative_costs[p_idx] = self.integrate_costs(edge_C, parents, p_idx)
                p_xyz = V[p_idx]
                rewire_path = cached["pnew_pnear"][en]["path"]
                rewire_reached = cached["pnew_pnear"][en]["reached"]
                rewire_safe = cached["pnew_pnear"][en]["safe"]
                if rewire_reached is True and rewire_safe is True:
                    rewire_path_c = self.cost_manager.path_cost(rewire_path, goal_xy[:2])
                else:
                    rewire_path_c = 0
                c = cumulative_last + rewire_path_c
                if (rewire_safe is True and c < cumulative_costs[p_idx] and rewire_reached is True):
                    E[parents[p_idx]].remove(p_idx)
                    # pop the edge keyed by the (parent, child) tuple; passing
                    # the pair as two arguments would silently treat the child
                    # index as a default return value instead of part of the key
                    edge_C.pop((parents[p_idx], p_idx))
                    edge_C[pnew_idx, p_idx] = rewire_path_c
                    parents[p_idx] = pnew_idx
                    if E.has_key(pnew_idx):
                        E[pnew_idx].add(p_idx)
                    else:
                        E[pnew_idx] = set([p_idx])
        rrt_iter += 1
        if rrt_iter == len(cached_points):
            planning_done = True
    nbrs.fit(V_xy)
    p4dist = np.zeros(4)
    p4dist[:2] = goal_xy[:2]; p4dist[2] = np.cos(goal_xy[2]); p4dist[3] = np.sin(goal_xy[2])
    points_near_goal = []
    add = 0
    while len(points_near_goal) == 0:
        dist, points_near_goal = nbrs.radius_neighbors([p4dist],
                                                       self.goal_tolerance+add,
                                                       return_distance=True)
        points_near_goal = points_near_goal[0]
        add += 0.1
    print "DONE PLANNING"
    print "TIME TAKEN", time.time()-t1
    print "POINTS NEAR GOAL", points_near_goal
    """ Find best path: """
    min_cost = None
    for i in points_near_goal:
        c_path = self.integrate_costs(edge_C, parents, i)
        if min_cost is None or c_path < min_cost:
            m = i
            min_cost = c_path
    self.publish_rrt(V, E)
    print "MINIMUM PATH COST RRT", min_cost
    path = self.get_path(parents, V, m)
    pt = path_to_pose(path)
    print 'total time: ', time.time()-t1
    self._path_pub.publish(pt)
    return pt, path
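# The goal-connection step above uses a growing-radius query: start from the
# goal tolerance and widen the radius until at least one tree vertex falls
# inside. A self-contained sketch of that idiom on plain 2-D points (no ROS);
# all values are illustrative.
import numpy as np
from sklearn.neighbors import NearestNeighbors

V = np.random.RandomState(1).rand(100, 2) * 10
goal = np.array([[9.5, 9.5]])
nbrs = NearestNeighbors().fit(V)
radius, near = 0.05, []
while len(near) == 0:
    near = nbrs.radius_neighbors(goal, radius=radius, return_distance=False)[0]
    radius += 0.1
print('found', len(near), 'vertices within radius', round(radius - 0.1, 2))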
def detect_and_filter_keypoints(im_gray, corner_type='HARRIS'):
    image = cv2.cvtColor(im_gray, cv2.COLOR_GRAY2BGR)
    image = cv2.GaussianBlur(image, (5, 5), 0)
    corners = None
    # use == for string comparison; `is` tests identity and is not reliable here
    if corner_type == 'HARRIS':
        corners = cv2.goodFeaturesToTrack(im_gray, 400, 0.005, 5,
                                          useHarrisDetector=True)
        if corners is not None:
            corners = np.int0(corners)
    elif corner_type == 'HOUGH_LINES':
        im_edge = cv2.Canny(im_gray, 100, 200)
        corners = detect_lines(im_edge, im_gray)
    if corners is None:
        return

    temp_corners = []
    ground_z = 0.0
    for i in corners:
        x, y = i.ravel()
        a00 = x * proj_matrix_[2, 0] - proj_matrix_[0, 0]
        a01 = x * proj_matrix_[2, 1] - proj_matrix_[0, 1]
        a10 = y * proj_matrix_[2, 0] - proj_matrix_[1, 0]
        a11 = y * proj_matrix_[2, 1] - proj_matrix_[1, 1]
        bv0 = proj_matrix_[0, 2] * ground_z + proj_matrix_[0, 3] - \
              x * proj_matrix_[2, 2] * ground_z - x * proj_matrix_[2, 3]
        bv1 = proj_matrix_[1, 2] * ground_z + proj_matrix_[1, 3] - \
              y * proj_matrix_[2, 2] * ground_z - y * proj_matrix_[2, 3]
        partition = a11 * a00 - a01 * a10
        pos_x = (a11 * bv0 - a01 * bv1) / partition
        pos_y = (a00 * bv1 - a10 * bv0) / partition
        temp_corners.append((pos_x, pos_y, ground_z))
        cv2.circle(image, (x, y), 3, (0, 0, 255), -1)

    # find the neighbours of each projected corner within a fixed radius
    radius_thresh = 6.0
    nearest_neigb = NearestNeighbors(radius=radius_thresh,
                                     algorithm="kd_tree",
                                     leaf_size=30,
                                     metric='euclidean').fit(np.array(temp_corners))
    distances, indices = nearest_neigb.radius_neighbors(np.array(temp_corners))
    temp_im = image.copy()
    possible_candidate = []
    for distance, index in zip(distances, indices):
        # the point itself appears in its own neighborhood at distance 0;
        # use it as the centroid of this neighborhood
        centroid_index = -1
        for dist, idx in zip(distance, index):
            if dist == 0:
                centroid_index = idx
                break
        for dist, idx in zip(distance, index):
            angle = vector_angle(temp_corners[centroid_index],
                                 temp_corners[idx]) * (180.0/np.pi)
            if (dist > (radius_thresh/2) and dist < radius_thresh) and (angle > 75 and angle < 180):
                possible_candidate.append(idx)

    centroid_point = []
    if len(possible_candidate) > 3:
        aver_x = 0
        aver_y = 0
        for pc in possible_candidate:
            cv2.circle(image, (corners[pc][0][0], corners[pc][0][1]), 4, (255, 255, 0), 1)
            aver_x += corners[pc][0][0]
            aver_y += corners[pc][0][1]
        aver_x /= len(possible_candidate)
        aver_y /= len(possible_candidate)
        centroid_point.append((aver_x, aver_y))
        cv2.circle(image, (aver_x, aver_y), 7, (0, 255, 0), -1)
    plot_image("edge1", image)
    return (np.array(possible_candidate), np.array(centroid_point))
def continuity_filter(wood, leaf, rad=0.05):
    """
    Function to apply a continuity filter to a point cloud that contains
    gaps defined as points from a second point cloud.
    This function works assuming that the continuous variable is the
    wood portion of a tree point cloud and the gaps in it are empty space
    or misclassified leaf data. In this sense, this function tries to
    correct gaps where leaf points are present.

    Args:
        wood (array): Wood point cloud to be filtered.
        leaf (array): Leaf point cloud, with points that may be causing
            discontinuities in the wood point cloud.
        rad (float): Radius to search for neighboring points in the
            iterative process.

    Returns:
        wood (array): Filtered wood point cloud.
        not_wood (array): Remaining point cloud after the filtering.

    """

    # Stacking wood and leaf arrays.
    arr = np.vstack((wood, leaf))

    # Obtaining wood point cloud indices.
    wood_id = np.arange(wood.shape[0])

    # Calculating shortest path graph over sampled array.
    G = array_to_graph(arr, 0, 3, 100, 0.05, 0.02, 0.5)
    _, dist = extract_path_info(G, 0, return_path=False)

    # Generating nearest neighbors search for the entire point cloud (arr).
    nbrs = NearestNeighbors(algorithm='kd_tree', leaf_size=10,
                            n_jobs=-1).fit(arr)

    # Converting dist variable to array, as it is originally a list.
    dist = np.asarray(dist)

    # Selecting points and accumulated distance for all wood points in arr.
    gp = arr[wood_id]
    d = dist[wood_id]

    # Preparing control variables to iterate over. idbase will be all initial
    # wood ids and pts all initial wood points. These variables are the ones
    # to use in search of possibly misclassified neighbors.
    idbase = wood_id
    pts = gp

    # Setting threshold variables for the iterative process.
    e = 9999999
    e_threshold = 3

    # Iterating until threshold is met.
    while e > e_threshold:

        # Obtaining the neighbor indices of the current set of points (pts).
        idx2 = nbrs.radius_neighbors(pts, radius=rad,
                                     return_distance=False)

        # Initializing temporary variable id1.
        id1 = []
        # Looping over nn search indices and comparing their respective
        # distances to the center point distance. If the nearest neighbor
        # distance (to the point cloud base) is smaller than the center point
        # distance, then the ith point is also wood.
        for i in range(idx2.shape[0]):
            for i_ in idx2[i]:
                if dist[i_] <= (d[i]):
                    id1.append(i_)

        # Keeping only unique ids in id1.
        id1 = np.unique(id1)

        # Comparing original idbase to new wood ids (id1).
        comp = np.in1d(id1, idbase)

        # Maintaining only new ids for the next iteration.
        diff = id1[np.where(~comp)[0]]
        idbase = np.unique(np.hstack((idbase, id1)))

        # Passing new wood points to pts and recalculating e value.
        pts = arr[diff]
        e = pts.shape[0]

        # Passing accumulated distances from new points to d.
        d = dist[diff]

        # Stacking new points to initial wood points and removing duplicates.
        gp = np.vstack((gp, pts))
        gp = remove_duplicates(gp)

    # Removing duplicates from final wood points and obtaining not_wood points
    # from the difference between final wood points and the full point cloud.
    wood = remove_duplicates(gp)
    not_wood = get_diff(wood, arr)

    return wood, not_wood
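# Minimal sketch of the iterative region-growing pattern used above: starting
# from seed points, repeatedly pull in radius-neighbors of the current
# frontier until no new points are added. Toy data and radius; not the
# original graph-based helpers.
import numpy as np
from sklearn.neighbors import NearestNeighbors

arr = np.random.RandomState(0).rand(200, 3)
nbrs = NearestNeighbors().fit(arr)
current = np.array([0])                       # seed index
accepted = set(current.tolist())
while len(current) > 0:
    hoods = nbrs.radius_neighbors(arr[current], radius=0.15,
                                  return_distance=False)
    new = set(np.concatenate(hoods).tolist()) - accepted
    accepted |= new
    current = np.fromiter(new, dtype=int)
print(len(accepted), 'points reached from the seed')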
def match_by_neighbor(self, caliper): ''' Performs propensity score matching. Parameters ---------- caliper : the attribute returned by the set_caliper() function Returns ------- matched_controls : Pandas DataFrame unmatched: Set under_matched: Set ''' ignore_list = set() under_matched = set() unmatched = set() matched_controls = pd.DataFrame() ratio = self.k data = self.df # convert data type data.INDEX_DATE = pd.to_datetime(data.INDEX_DATE) data.CASE = data.CASE.astype(int) controls = data[data.CASE == 0] cases = data[data.CASE == 1] neigh = NearestNeighbors(radius=caliper, algorithm='ball_tree', n_jobs=1) neigh.fit(controls[['PSCORE']]) # calculate time i = 1 total_cases = cases.shape[0] start = timeit.default_timer() #loop through each case for index, case in cases.iterrows(): # case index date case_indexdate = cases[cases.PATID == case['PATID']].INDEX_DATE.values[0] # current case's pscore pscore = case.PSCORE # find all controls with pscore within the caliper distance distances, indices = neigh.radius_neighbors([[pscore]]) sample = controls.iloc[indices[0]] # pick out those that have NOT been used sample = sample[~sample['PATID'].isin(ignore_list)].copy() ## verify index date for control sample['INDEX_DATE_GAP'] = abs(sample.INDEX_DATE - case_indexdate) / np.timedelta64(1, 'D') sample = sample[sample.INDEX_DATE_GAP <= self.gap].sort_values(by=['PATID', 'INDEX_DATE_GAP']) # rank the samples by their distances to the case's pscore sample['DIST'] = abs(sample['PSCORE']-pscore) sample.sort_values(by='DIST', ascending=True, inplace=True) # picked the closest "ratio" sample = sample.head(ratio).copy().reset_index(drop=True) if (sample.shape[0] < ratio and sample.shape[0] != 0): under_matched.add(case['PATID']) if (sample.shape[0] == 0): unmatched.add(case['PATID']) # exclude the selected sample from the matching pool (i.e., without replacement) ignore_list.update(sample['PATID']) sample['MATCHED_CASE'] = case['PATID'] sample['MATCHED_CASE_INDEX_DATE'] = case_indexdate matched_controls = matched_controls.append(sample, ignore_index=True) # track progress stop = timeit.default_timer() print("Current progress:", np.round(i/total_cases * 100, 2), "%") print("Current run time:", np.round((stop - start) / 60, 2), "min") i = i+1 matched_controls = matched_controls.reset_index(drop=True) cases = find_case(data, unmatched) self.matched_controls = matched_controls self.unmatched = unmatched write_matched_data(self.path, cases, matched_controls) return under_matched, unmatched, matched_controls
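# Sketch of the core caliper lookup inside the method above: a 1-D radius
# query over propensity scores returns every control whose score lies within
# `caliper` of a case's score, which is then ranked by closeness. The scores
# and caliper below are illustrative toy values, not from the class above.
import numpy as np
from sklearn.neighbors import NearestNeighbors

control_scores = np.array([[0.10], [0.12], [0.35], [0.80]])
caliper = 0.05
neigh = NearestNeighbors(radius=caliper, algorithm='ball_tree').fit(control_scores)
case_score = 0.11
dist, idx = neigh.radius_neighbors([[case_score]])
order = np.argsort(dist[0])          # rank eligible controls by closeness
print(idx[0][order])                 # controls 0 and 1 fall inside the caliper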
import numpy as np
from sklearn.neighbors import NearestNeighbors

'''
NearestNeighbors(n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None, n_jobs=None,
                 **kwargs)
'''

data = np.array([[1, 1], [1, 2], [2, 1], [2, 3], [1, 5],
                 [6, 8], [7, 9], [6, 9], [8, 8], [8, 10],
                 [14, 1], [14, 2], [15, 1], [15, 3]])

nearest_neighbors_model = NearestNeighbors(n_neighbors=5, algorithm='auto', radius=1.0)
nearest_neighbors_model.fit(data)

# Calculating details
print('NearestNeighborsModel kneighbors are : ',
      nearest_neighbors_model.kneighbors(data))
print("-------------------------------------------------------")
print('NearestNeighborsModel radius neighbors are : ',
      nearest_neighbors_model.radius_neighbors(data))
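# Follow-up to the example above (reuses `nearest_neighbors_model` and
# `data`): unlike kneighbors, radius_neighbors does not guarantee sorted
# results, so sort each row by distance yourself. Recent scikit-learn
# releases also accept sort_results=True on radius queries (added around
# version 0.24, to the best of my knowledge -- treat that as an assumption).
import numpy as np

dists, idxs = nearest_neighbors_model.radius_neighbors(data)
row_d, row_i = dists[0], idxs[0]
order = np.argsort(row_d)
print('sorted neighbors of data[0]:', row_i[order], row_d[order])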
def get_neighbors(self, pts, radius): pts = self.transform_points(pts) radius = radius * self.pixels_per_meter nn = NearestNeighbors(n_neighbors=5, radius=radius, p=2) nn.fit(pts) return nn.radius_neighbors(pts, return_distance=False)
def mean_shift_cosine(X, bandwidth=None, seeds=None, cluster_all=True, GPU=True):
    """Perform mean shift clustering of data using a flat kernel.

    Read more in the :ref:`User Guide <mean_shift>`.

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    bandwidth : float, optional
        Kernel bandwidth.
        If bandwidth is not given, it is determined using a heuristic based on
        the median of all pairwise distances. This will take quadratic time in
        the number of samples. The sklearn.cluster.estimate_bandwidth function
        can be used to do this more efficiently.

    seeds : array-like, shape=[n_seeds, n_features] or None
        Points used as initial kernel locations.

    cluster_all : boolean, default True
        If true, then all points are clustered, even those orphans that are
        not within any kernel. Orphans are assigned to the nearest kernel.
        If false, then orphans are given cluster label -1.

    GPU : bool, default True
        If True, use the GPU-based (faster) mean shift.

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.
    """
    if bandwidth is None:
        bandwidth = estimate_bandwidth(X)
    elif bandwidth <= 0:
        raise ValueError("bandwidth needs to be greater than zero or None,\
            got %f" % bandwidth)

    if seeds is None:
        if GPU == True:
            seeds = gpu_seed_generator(X)

    n_samples, n_features = X.shape
    center_intensity_dict = {}
    nbrs = NearestNeighbors(radius=bandwidth, metric='cosine').fit(X)

    global SEED_NUM
    if GPU == True:
        # GPU version
        while True:
            labels, number = meanshift_torch(X, seeds, bandwidth)  # GPU calculation
            for i in range(len(number)):
                if number[i] is not None:
                    # record the cluster found around this seed
                    center_intensity_dict[tuple(labels[i])] = number[i]

            if not center_intensity_dict:
                # nothing near seeds
                raise ValueError("No point was within bandwidth=%f of any seed."
                                 " Try a different seeding strategy \
                                 or increase the bandwidth." % bandwidth)

            # POST PROCESSING: remove near duplicate points
            # If the distance between two kernels is less than the bandwidth,
            # then we have to remove one because it is a duplicate. Remove the
            # one with fewer points.
            sorted_by_intensity = sorted(center_intensity_dict.items(),
                                         key=lambda tup: (tup[1], tup[0]),
                                         reverse=True)
            sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
            unique = np.ones(len(sorted_centers), dtype=bool)
            nbrs = NearestNeighbors(radius=bandwidth,
                                    metric='cosine').fit(sorted_centers)
            for i, center in enumerate(sorted_centers):
                if unique[i]:
                    neighbor_idxs = nbrs.radius_neighbors(
                        [center], return_distance=False)[0]
                    unique[neighbor_idxs] = 0
                    unique[i] = 1  # leave the current point as unique
            cluster_centers = sorted_centers[unique]

            # assign labels
            nbrs = NearestNeighbors(n_neighbors=1,
                                    metric='cosine').fit(cluster_centers)
            labels = np.zeros(n_samples, dtype=int)
            distances, idxs = nbrs.kneighbors(X)
            if cluster_all:
                labels = idxs.flatten()
            else:
                labels.fill(-1)
                bool_selector = distances.flatten() <= bandwidth
                labels[bool_selector] = idxs.flatten()[bool_selector]

            bg_num = np.sum(labels == 0)
            r = 1 - bg_num / labels.size

            # seed number adjustment
            dict_len = len(cluster_centers)  # number of clusters
            N = get_N(0.95, r, dict_len)

            if L * N <= SEED_NUM:  # safety area
                if H * N <= SEED_NUM:
                    SEED_NUM -= N  # too many seeds, adjust
                break
            else:
                seeds = gpu_seed_adjust(X)  # too few seeds, adjust

    return cluster_centers, labels
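# Usage sketch for the function above. `estimate_bandwidth` is the standard
# scikit-learn helper its docstring points to; the GPU-side dependencies
# (gpu_seed_generator, meanshift_torch, get_N, ...) are external, so the
# clustering call itself is left commented. Data and quantile are illustrative.
import numpy as np
from sklearn.cluster import estimate_bandwidth

X = np.random.RandomState(0).rand(300, 8)
bw = estimate_bandwidth(X, quantile=0.2)   # heuristic bandwidth estimate
print('estimated bandwidth:', bw)
# cluster_centers, labels = mean_shift_cosine(X, bandwidth=bw)  # needs GPU deps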
def mean_shift(X, intensities=None, bandwidth=None, seeds=None,
               cluster_all=True, max_iterations=300, verbose=False,
               use_scipy=True):
    """mean_shift(X, intensities=None, bandwidth=None, seeds=None,
    cluster_all=True, max_iterations=300, verbose=False, use_scipy=True)

    Mean shift algorithm

    Implementation taken from scikit-learn with two minor variants:

        - Use (by default) scipy KD-trees, which are faster in our case
        - weighted version of mean shift using `intensities` as weights (i.e.,
          we compute centers of mass rather than means)

    Parameters
    ----------

    X : array-like, shape=[n_samples, n_features]
        Input data.

    intensities : array-like, shape=[n_samples]
        Voxel intensities, used to weight the mean

    bandwidth : float
        Kernel bandwidth.

    seeds : array-like, shape=[n_seeds, n_features]
        Points used as initial kernel locations.

    use_scipy : bool
        If true use cKDTree from scipy.spatial, otherwise use
        NearestNeighbors from sklearn.neighbors

    Returns
    -------

    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    volumes : array, shape=[n_clusters]
        Volume of each cluster (# of points in the cluster)

    masses : array, shape=[n_clusters]
        Mass of each cluster (sum of intensities of points in the cluster).

    trajectories : list
        MS trajectories for debugging purposes.
    """
    if seeds is None:
        seeds = X
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    center_volume_dict = {}
    center_mass_dict = {}

    if use_scipy:
        kdtree = cKDTree(X)
    else:
        nbrs = NearestNeighbors(radius=bandwidth).fit(X)

    # For each seed, climb gradient until convergence or max_iterations
    trajectories = {}  # for each seed, a list of points
    tee.log('Moving kernels for', len(seeds), 'seeds')
    pbar = pb.ProgressBar(widgets=['Moving %d seeds: ' % len(seeds),
                                   pb.Percentage()],
                          maxval=len(seeds)).start()
    for seed_no, my_mean in enumerate(seeds):
        completed_iterations = 0
        seed = my_mean
        trajectories[seed_no] = []
        while True:
            # Find mean of points within bandwidth
            if use_scipy:
                i_nbrs = kdtree.query_ball_point(my_mean, r=bandwidth)
            else:
                i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                               return_distance=False)[0]
            points_within = X[i_nbrs]
            if len(points_within) == 0:
                break  # Depending on seeding strategy this condition may occur
            my_old_mean = my_mean  # save the old mean
            if intensities is None:
                my_mean = np.mean(points_within, axis=0)
            else:
                my_mean = np.average(points_within, axis=0,
                                     weights=intensities[i_nbrs])
            # If converged or at max_iterations, add the cluster
            if (extmath.norm(my_mean - my_old_mean) < stop_thresh or
                    completed_iterations == max_iterations):
                center_volume_dict[tuple(my_mean)] = len(points_within)
                if intensities is None:
                    # unweighted variant: fall back to the point count as mass
                    center_mass_dict[tuple(my_mean)] = len(points_within)
                else:
                    center_mass_dict[tuple(my_mean)] = sum(intensities[i_nbrs])
                break
            completed_iterations += 1
            trajectories[seed_no].append(my_mean)
        if verbose:
            print('seed', seed, '-->', my_mean,
                  center_volume_dict[tuple(my_mean)],
                  center_mass_dict[tuple(my_mean)], completed_iterations)
        pbar.update(seed_no+1)
    pbar.finish()

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_mass_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    print('started from', len(seeds), 'seeds, now |unique|=', len(unique))
    if len(center_mass_dict) == 0:
        tee.log('No valid seeds. Giving up')
        return None, None, None, None, None
    nbrs = NearestNeighbors(radius=bandwidth).fit(sorted_centers)
    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = nbrs.radius_neighbors([center],
                                                  return_distance=False)[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique
    cluster_centers = sorted_centers[unique]
    print('|cluster_centers|=', len(cluster_centers))
    volumes = [0]*len(cluster_centers)
    masses = [0]*len(cluster_centers)
    for i, c in enumerate(cluster_centers):
        volumes[i] = center_volume_dict[tuple(c)]
        masses[i] = center_mass_dict[tuple(c)]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1).fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    if cluster_all:
        labels = idxs.flatten()
    else:
        labels[:] = -1
        bool_selector = distances.flatten() <= bandwidth
        labels[bool_selector] = idxs.flatten()[bool_selector]
    return cluster_centers, labels, volumes, masses, trajectories
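# The only difference between the weighted and unweighted kernel update in
# the function above is one line; a tiny sketch of that step with toy numbers:
import numpy as np

pts = np.array([[0., 0.], [1., 0.], [2., 0.]])
w = np.array([1., 1., 8.])
print(np.mean(pts, axis=0))                # plain mean        -> [1.  0.]
print(np.average(pts, axis=0, weights=w))  # center of mass    -> [1.7 0.]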
import numpy as np from sklearn.neighbors import NearestNeighbors samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]] neigh = NearestNeighbors(n_neighbors=2, radius=0.4) neigh.fit(samples) neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False) nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False) np.asarray(nbrs[0][0])
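# A related convenience next to the example above: `radius_neighbors_graph`
# returns the same neighborhoods as a sparse matrix, which is often handier
# than ragged object arrays. Same toy samples; radius 1.0 is chosen so that
# points 0 and 2 (distance exactly 1) end up connected.
import numpy as np
from sklearn.neighbors import NearestNeighbors

samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]
neigh = NearestNeighbors(radius=1.0).fit(samples)
A = neigh.radius_neighbors_graph(samples, mode='connectivity')
print(A.toarray())   # row i marks the points within radius 1.0 of sample i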
def fit(self, X, y=None, sample_weight=None): """Perform common-nearest-neighbor clustering Cluster from features, or distance matrix. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features), or (n_samples, n_samples) Training instances to cluster, or distances between instances if `metric='precomputed'`. If a sparse matrix is provided, it will be converted into a sparse `csr_matrix`. sample_weight : array-like of shape (n_samples,), default=None Weight of each sample. Note, that this option is not fully supported at the moment. y : Ignored Not used, present here for API consistency by convention. Returns ------- self """ if LooseVersion(sklearn.__version__) < LooseVersion("0.23.0"): X = check_array(X, accept_sparse="csr") else: X = self._validate_data(X, accept_sparse="csr") if not self.eps > 0.0: raise ValueError("eps must be positive.") if sample_weight is not None: warnings.warn("Sample weights are not fully supported, yet.", UserWarning) if LooseVersion(sklearn.__version__) < LooseVersion("0.23.0"): sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) else: sample_weight = _check_sample_weight(sample_weight, X) # Calculate neighborhood for all samples. This leaves the # original point in, which needs to be considered later # (i.e. point i is in the # neighborhood of point i). While True, its useless information if self.metric == "precomputed" and sparse.issparse(X): # set the diagonal to explicit values, as a point is its own # neighbor with warnings.catch_warnings(): warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning) X.setdiag(X.diagonal()) neighbors_model = NearestNeighbors( radius=self.eps, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs, ) neighbors_model.fit(X) # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False) if sample_weight is None: n_neighbors = np.array( [len(neighbors) for neighbors in neighborhoods]) else: n_neighbors = np.array([ np.sum(sample_weight[neighbors]) for neighbors in neighborhoods ]) # Initially, all samples are noise. labels = np.full(X.shape[0], -1, dtype=np.intp) # Account for self neighbour membership (self.min_samples + 2) corrected_min_samples = self.min_samples + 2 # Array tracking points qualified for similarity check core_candidates = np.asarray(n_neighbors >= corrected_min_samples) commonnn_inner(neighborhoods, labels, core_candidates, corrected_min_samples) self.labels_ = labels return self
def plan_cached_rrt(self, cached_points):
    """ RRT* Algorithm """
    marker_points = MarkerArray()
    vol_freecells = len(self._freecells)*self._navmap.info.resolution**2
    print "FREE CELL VOLUME", vol_freecells
    gamma_rrg = 2*sqrt(1.5*vol_freecells/pi)
    probot = np.array([self._robot_pose.pose.position.x,
                       self._robot_pose.pose.position.y])
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree", leaf_size=30)
    # V is a list of vertices. E is a dictionary where each key is connected
    # with its values. parents is a dictionary mapping each node to its
    # parent; since each node is a key, it has exactly one parent.
    V = [probot]
    E = {}
    parents = {}
    Dist = [0.0]
    # C stores the cost at vertex idx, which is the sum of the edges going to it.
    goal_xy = np.array([self._goal.pose.position.x, self._goal.pose.position.y])
    c_init = self.cost_manager.get_cost(probot, goal_xy)
    edge_C = {}
    C = [c_init]
    lowest_cost_idx = None
    t1 = time.time()
    planning_done = False
    rrt_iter = 0
    while not planning_done:
        t2 = time.time()
        """ Sampling new point """
        cached = cached_points[rrt_iter]
        prand = cached["prand"]
        pnearest_idx = cached["pnearest_idx"]
        pnearest = V[pnearest_idx]
        """ Turning new point into reachable point """
        pnew = cached["pnew"]
        Pnear_idx = cached["Pnear_idx"]
        pmin_idx = pnearest_idx
        c_nearest = C[pnearest_idx]
        c_new = self.cost_manager.get_cost(pnew, goal_xy)
        cum_c = self.integrate_costs(edge_C, parents, pnearest_idx)
        min_edge_c = self.cost_manager.edge_cost(c_nearest, c_new, pnearest, pnew)
        cmin = cum_c + min_edge_c
        cumulative_costs = []
        for p_idx in Pnear_idx:
            p = V[p_idx]
            c_near = C[p_idx]
            cum_cost = self.integrate_costs(edge_C, parents, p_idx)
            cumulative_costs.append(cum_cost)
            edge_c = self.cost_manager.edge_cost(c_near, c_new, p, pnew)
            c = cum_cost + edge_c
            if (self.segment_safe(p, pnew) is True and c < cmin):
                cmin = c
                min_edge_c = edge_c
                pmin_idx = p_idx
        if E.has_key(pmin_idx):
            E[pmin_idx].add(len(V))
        else:
            E[pmin_idx] = set([len(V)])
        edge_C[pmin_idx, len(V)] = min_edge_c
        cumulative_last = cmin
        pnew_idx = len(V)
        V.append(pnew)
        C.append(c_new)
        parents[pnew_idx] = pmin_idx
        """ Re-wire the tree """
        for en, p_idx in enumerate(Pnear_idx):
            # only near nodes that already have a parent can be rewired
            if parents.has_key(p_idx):
                p = V[p_idx]
                c_near = C[p_idx]
                e_c = self.cost_manager.edge_cost(c_near, c_new, p, pnew)
                c = cumulative_last + e_c
                if (self.segment_safe(p, pnew) is True and c < cumulative_costs[en]):
                    E[parents[p_idx]].remove(p_idx)
                    # pop the edge keyed by the (parent, child) tuple; passing
                    # two arguments would treat the child index as a default
                    edge_C.pop((parents[p_idx], p_idx))
                    edge_C[pnew_idx, p_idx] = e_c
                    parents[p_idx] = pnew_idx
                    if E.has_key(pnew_idx):
                        E[pnew_idx].add(p_idx)
                    else:
                        E[pnew_idx] = set([p_idx])
        rrt_iter += 1
        if rrt_iter == len(cached_points):
            planning_done = True
    nbrs.fit(V)
    dist, points_near_goal = nbrs.radius_neighbors([goal_xy],
                                                   self.goal_tolerance+0.1,
                                                   return_distance=True)
    points_near_goal = points_near_goal[0]
    print "DONE PLANNING"
    print "TIME TAKEN", time.time()-t1
    print "POINTS NEAR GOAL", points_near_goal
    """ Find best path: """
    min_cost = None
    for i in points_near_goal:
        c_path = self.integrate_costs(edge_C, parents, i)
        if min_cost is None or c_path < min_cost:
            m = i
            min_cost = c_path
    path = self.get_path(parents, V, m)
    path = self.smoothing(path)
    pt = path_to_pose(path)
    print 'total time: ', time.time()-t1
    self.publish_rrt(V, E)
    self._path_pub.publish(pt)
    return pt, path
class GlobalKMeans(BaseEstimator, ClusterMixin, TransformerMixin):
    """Global K-means Algorithm

    Parameters:

    n_clusters: int
        maximum number of clusters to obtain
    algorithm: string
        'classical' - the classical algorithm
        'bagirov' - the Bagirov 2006 variant
    radius: float
        assignment threshold used by predict(); examples farther than this
        from every center are labeled -1
    """

    def __init__(self, n_clusters, algorithm='classical', radius=np.inf):
        self.n_clusters = n_clusters
        self.cluster_centers_ = None
        self.labels_ = None
        self.cluster_sizes_ = None
        self.inertia_ = None
        self.algorithm = algorithm
        # radius was referenced by predict() but never set; default to
        # infinity so that, unless a threshold is given, every example is
        # assigned to its nearest cluster
        self.radius = radius

    def fit(self, X):
        """
        Clusters the examples

        :param X:
        :return:
        """
        if self.algorithm == 'classical':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process(X)
        elif self.algorithm == 'bagirov':
            self.cluster_centers_, self.labels_, self.inertia_ = self._fit_process_bagirov(X)
        return self

    def predict(self, X):
        """
        Returns the nearest cluster for a data matrix

        @param X:
        @return:
        """
        clasif = []
        for i in range(X.shape[0]):
            ncl, mdist = self._find_nearest_cluster(X[i].reshape(1, -1),
                                                    self.cluster_centers_)
            if mdist <= self.radius:
                clasif.append(ncl)
            else:
                clasif.append(-1)
        return clasif

    def _fit_process(self, X):
        """
        Classical global k-means algorithm

        :param X:
        :return:
        """
        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        centroids.shape = (1, X.shape[1])

        for i in range(2, self.n_clusters + 1):
            mininertia = np.infty
            for j in range(X.shape[0]):
                newcentroids = np.vstack((centroids, X[j]))
                km = KMeans(n_clusters=i, init=newcentroids, n_init=1)
                km.fit(X)
                if mininertia > km.inertia_:
                    mininertia = km.inertia_
                    bestkm = km
            centroids = bestkm.cluster_centers_

        return bestkm.cluster_centers_, bestkm.labels_, bestkm.inertia_

    def _fit_process_bagirov(self, X):
        """
        Clusters using the global K-means algorithm Bagirov variation

        :param X:
        :return:
        """
        # Create a KNN structure for fast search
        self._neighbors = NearestNeighbors()
        self._neighbors.fit(X)

        # Compute the centroid of the dataset
        centroids = sum(X) / X.shape[0]
        assignments = [0 for i in range(X.shape[0])]
        centroids.shape = (1, X.shape[1])

        # compute the distance of the examples to the centroids
        mindist = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            mindist[i] = \
                euclidean_distances(X[i].reshape(1, -1),
                                    centroids[assignments[i]].reshape(1, -1),
                                    squared=True)[0]

        for k in range(2, self.n_clusters + 1):
            newCentroid = self._compute_next_centroid(X, centroids,
                                                      assignments, mindist)
            centroids = np.vstack((centroids, newCentroid))
            km = KMeans(n_clusters=k, init=centroids, n_init=1)
            km.fit(X)
            assignments = km.labels_
            for i in range(X.shape[0]):
                mindist[i] = \
                    euclidean_distances(X[i].reshape(1, -1),
                                        centroids[assignments[i]].reshape(1, -1),
                                        squared=True)[0]

        return km.cluster_centers_, km.labels_, km.inertia_

    def _compute_next_centroid(self, X, centroids, assignments, mindist):
        """
        Computes the candidate for the next centroid

        :param X:
        :param centroids:
        :return:
        """
        minsum = np.infty
        candCentroid = None

        # Compute the first candidate to new centroid
        for i in range(X.shape[0]):
            distance = euclidean_distances(X[i].reshape(1, -1),
                                           centroids[assignments[i]].reshape(1, -1))[0]
            S2 = self._neighbors.radius_neighbors(X[i].reshape(1, -1),
                                                  radius=distance,
                                                  return_distance=False)[0]
            S2centroid = np.sum(X[S2], axis=0) / len(S2)
            S2centroid.shape = (1, X.shape[1])
            cost = self._compute_fk(X, mindist, S2centroid)

            if cost < minsum:
                minsum = cost
                candCentroid = S2centroid

        # Compute examples for the new centroid
        S2 = []
        newDist = euclidean_distances(X, candCentroid.reshape(1, -1),
                                      squared=True)
        for i in range(X.shape[0]):
            if newDist[i] < mindist[i]:
                S2.append(i)

        newCentroid = sum(X[S2]) / len(S2)
        newCentroid.shape = (1, X.shape[1])

        while not (candCentroid == newCentroid).all():
            candCentroid = newCentroid
            S2 = []
            newDist = euclidean_distances(X, candCentroid.reshape(1, -1),
                                          squared=True)
            for i in range(X.shape[0]):
                if newDist[i] < mindist[i]:
                    S2.append(i)
            newCentroid = np.sum(X[S2], axis=0) / len(S2)
            newCentroid.shape = (1, X.shape[1])

        return candCentroid

    def _compute_fk(self, X, mindist, ccentroid):
        """
        Computes the cost function

        :param X:
        :param mindist:
        :param ccentroid:
        :return:
        """
        # Distances between the examples and the candidate centroid
        centdist = euclidean_distances(X, ccentroid.reshape(1, -1), squared=True)

        fk = 0
        for i in range(X.shape[0]):
            fk = fk + min(mindist[i], centdist[i][0])

        return fk

    @staticmethod
    def _find_nearest_cluster(examp, centers):
        """
        Finds the nearest cluster for an example

        :param examp:
        :param centers:
        :return:
        """
        dist = euclidean_distances(centers, examp.reshape(1, -1))
        pmin = np.argmin(dist)
        vmin = np.min(dist)
        return pmin, vmin
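# Usage sketch for the GlobalKMeans estimator defined above, on synthetic
# blobs. Assumes the class as written (scikit-learn's KMeans underneath);
# the make_blobs parameters are illustrative.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, random_state=42)
gkm = GlobalKMeans(n_clusters=3).fit(X)
print(gkm.cluster_centers_)
print(gkm.inertia_)
print(gkm.predict(X[:5]))   # nearest-cluster labels (radius defaults to inf)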
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1): """Perform DBSCAN clustering from vector array or distance matrix. Read more in the :ref:`User Guide <dbscan>`. Parameters ---------- X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ array of shape (n_samples, n_samples) A feature array, or array of distances between samples if ``metric='precomputed'``. eps : float, optional The maximum distance between two samples for them to be considered as in the same neighborhood. min_samples : int, optional The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself. metric : string, or callable The metric to use when calculating distance between instances in a feature array. If metric is a string or callable, it must be one of the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and must be square. X may be a sparse matrix, in which case only "nonzero" elements may be considered neighbors for DBSCAN. metric_params : dict, optional Additional keyword arguments for the metric function. .. versionadded:: 0.19 algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional The algorithm to be used by the NearestNeighbors module to compute pointwise distances and find nearest neighbors. See NearestNeighbors module documentation for details. leaf_size : int, optional (default = 30) Leaf size passed to BallTree or cKDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem. p : float, optional The power of the Minkowski metric to be used to calculate distance between points. sample_weight : array, shape (n_samples,), optional Weight of each sample, such that a sample with a weight of at least ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. n_jobs : int, optional (default = 1) The number of parallel jobs to run for neighbors search. If ``-1``, then the number of jobs is set to the number of CPU cores. Returns ------- core_samples : array [n_core_samples] Indices of core samples. labels : array [n_samples] Cluster labels for each point. Noisy samples are given the label -1. Notes ----- For an example, see :ref:`examples/cluster/plot_dbscan.py <sphx_glr_auto_examples_cluster_plot_dbscan.py>`. This implementation bulk-computes all neighborhood queries, which increases the memory complexity to O(n.d) where d is the average number of neighbors, while original DBSCAN had memory complexity O(n). Sparse neighborhoods can be precomputed using :func:`NearestNeighbors.radius_neighbors_graph <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with ``mode='distance'``. References ---------- Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise". In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 
1996 """ if not eps > 0.0: raise ValueError("eps must be positive.") X = check_array(X, accept_sparse='csr') if sample_weight is not None: sample_weight = np.asarray(sample_weight) check_consistent_length(X, sample_weight) # Calculate neighborhood for all samples. This leaves the original point # in, which needs to be considered later (i.e. point i is in the # neighborhood of point i. While True, its useless information) if metric == 'precomputed' and sparse.issparse(X): neighborhoods = np.empty(X.shape[0], dtype=object) X.sum_duplicates() # XXX: modifies X's internals in-place X_mask = X.data <= eps masked_indices = X.indices.astype(np.intp, copy=False)[X_mask] masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]] # insert the diagonal: a point is its own neighbor, but 0 distance # means absence from sparse matrix data masked_indices = np.insert(masked_indices, masked_indptr, np.arange(X.shape[0])) masked_indptr = masked_indptr[:-1] + np.arange(1, X.shape[0]) # split into rows neighborhoods[:] = np.split(masked_indices, masked_indptr) else: neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm, leaf_size=leaf_size, metric=metric, metric_params=metric_params, p=p, n_jobs=n_jobs) neighbors_model.fit(X) # This has worst case O(n^2) memory complexity neighborhoods = neighbors_model.radius_neighbors(X, eps, return_distance=False) if sample_weight is None: n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods]) else: n_neighbors = np.array([np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]) # Initially, all samples are noise. labels = -np.ones(X.shape[0], dtype=np.intp) # A list of all core samples found. core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8) dbscan_inner(core_samples, neighborhoods, labels) return np.where(core_samples)[0], labels
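# Usage sketch for the dbscan function above (it mirrors the scikit-learn
# function of the same name): two dense blobs plus one far outlier. The eps
# and min_samples values are illustrative.
import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 2) * 0.2,
               rng.randn(20, 2) * 0.2 + 5,
               [[50., 50.]]])
core_samples, labels = dbscan(X, eps=0.5, min_samples=5)
print('number of core samples:', len(core_samples))
print('label of the outlier:', labels[-1])   # expected: -1 (noise)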
epxy = np.transpose([rx2,ry2])
if local:
    # does this function actually use the nrefstars parameter?
    nbrs = NN(n_neighbors=nrefstars, algorithm='auto').fit(epxy)
    # transformed coordinates; arrays are initialized to zeros
    ctx = np.zeros(x1.size)
    cty = np.zeros(y1.size)
    nne = np.zeros(x1.size) - 1
    if rad_ext!=0:
        print '\nYou asked for %d refstars to transform each star' %(nrefstars-1)
        dist, nei = nbrs.radius_neighbors(np.transpose([x2,y2]),radius=rad_ext)
        nbors = np.array([len(d) for d in dist])
        # nei holds one index array of reference stars per analyzed star;
        # printing nei shows the indices of the refstars selected for each star
        #print 'nei:', nei
        #print 'nei:', nei.shape
        #print 'nei:', nei[0]
        #print 'refstars at distance < rad_ext (id) \n', rid1[nei[0]]
        #print 'Number of refstars found per star (r<rad_ext):', nbors
        # dist holds one array of refstar distances per analyzed star
        # how do I print the distances from smallest to largest?
        #print 'Refstar distances per star:\n', dist[0]
class SPIDER3(BaseSampler):
    """
    SPIDER3 algorithm implementation for selective preprocessing of
    multi-class imbalanced data sets.

    Reference:
    Wojciechowski, S., Wilk, S., Stefanowski, J.: An Algorithm for Selective
    Preprocessing of Multi-class Imbalanced Data. Proceedings of the 10th
    International Conference on Computer Recognition Systems CORES 2017
    """

    def __init__(self, k, maj_int_min=None, cost=None):
        """
        :param k:
            Number of nearest neighbors considered while resampling.
        :param maj_int_min:
            Dict that contains lists of majority, intermediate and minority
            classes labels.
        :param cost:
            The cost matrix. An element c[i, j] of this matrix represents the
            cost associated with misclassifying an example from class i as an
            example from class j.
        """
        super().__init__()
        self._sampling_type = 'clean-sampling'
        self.k = k
        self.neigh_clf = NearestNeighbors(n_neighbors=self.k)
        self.maj_int_min = maj_int_min
        self.cost = cost
        self.AS, self.RS = np.array([]), np.array([])

    def _fit_resample(self, X, y):
        """
        Performs resampling.

        :param X:
            Numpy array of examples that is the subject of resampling.
        :param y:
            Numpy array of labels corresponding to examples from X.
        :return:
            Resampled X along with accordingly modified labels, resampled y
        """
        self._initialize_algorithm(X, y)

        self.DS = np.append(X, y.reshape(y.shape[0], 1), axis=1)
        self._restart_perspective()
        self._calculate_weak_majority_examples()
        self._restore_perspective()
        self.DS = setdiff(self.DS, self.RS)

        int_classes, min_classes = self._sort_by_cardinality(y)

        for int_min_class in int_classes + min_classes:
            self.relabel(int_min_class)
            self.clean(int_min_class)
            self.amplify(int_min_class)

        self.DS = union(self.DS, self.AS)
        return self.DS[:, :-1], self.DS[:, -1]

    def _initialize_algorithm(self, X, y):
        if self.maj_int_min is None:
            self.maj_int_min = construct_maj_int_min(y)
        self.majority_classes = self.maj_int_min['maj']
        self.intermediate_classes = self.maj_int_min['int']
        self.minority_classes = self.maj_int_min['min']
        self.stds, self.means = [1] * X.shape[1], [0] * X.shape[1]
        if self.cost is None:
            self.cost = self._estimate_cost_matrix(y)

    @staticmethod
    def _estimate_cost_matrix(y):
        """
        Method that estimates the cost matrix automatically. For example,
        given imbalance ratios of 1:2:6, the estimated matrix will be:

            [[0, 1, 1],
             [2, 0, 1],
             [6, 3, 0]]

        :param y: labels
        :return: cost matrix
        """
        class_cardinality = Counter(y)
        classes = list(class_cardinality.keys())
        cost = np.ones([len(classes), len(classes)])
        for i, (c1, card1) in enumerate(class_cardinality.items()):
            for j, (c2, card2) in enumerate(class_cardinality.items()):
                if j > i:
                    cost[i, j] = 1
                else:
                    cost[i, j] = card1 / card2
        np.fill_diagonal(cost, 0)
        return cost

    def _sort_by_cardinality(self, y):
        class_cardinality = Counter(y)
        # to ensure looping over classes with decreasing cardinality.
int_classes = sorted(self.intermediate_classes, key=lambda clazz: -class_cardinality[clazz]) min_classes = sorted(self.minority_classes, key=lambda clazz: -class_cardinality[clazz]) return int_classes, min_classes def amplify(self, int_min_class): self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] for x in int_min_ds: self._amplify_nn(x) self._restore_perspective() def clean(self, int_min_class): self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] int_min_as = self._calc_int_min_as(int_min_class) for x in union(int_min_ds, int_min_as): self._clean_nn(x) self._restore_perspective() def relabel(self, int_min_class): self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] for x in int_min_ds: self._relabel_nn(x) self._restore_perspective() def _restart_perspective(self): """ Performs normalization over resampled dataset. """ for col in range(self._ds_as_rs_union().shape[1] - 1): self.stds[col] = self._ds_as_rs_union()[:, col].std() self.means[col] = self._ds_as_rs_union()[:, col].mean() for col in range(self._ds_as_rs_union().shape[1] - 1): if self.stds[col] == 0: self.stds[col] = 1e-6 for dataset in [self.DS, self.RS, self.AS]: if dataset.shape[0] > 0: self._normalize(dataset) def _restore_perspective(self): """ Denormalizes for further processing. """ for dataset in [self.DS, self.RS, self.AS]: if dataset.shape[0] > 0: self._denormalize(dataset) def _normalize(self, dataset): for col in range(dataset.shape[1] - 1): dataset[:, col] = (dataset[:, col] - self.means[col]) / (4 * self.stds[col]) def _denormalize(self, dataset): for col in range(dataset.shape[1] - 1): dataset[:, col] = dataset[:, col] * self.stds[col] * 4 + self.means[col] def _calc_int_min_as(self, int_min_class): """ Helper method to calculate examples form AS that belong to int_min_class parameter class. :param int_min_class: The class name (intermediate or minority). :return: Examples from AS that are belong to int_min_class. """ if self.AS.size != 0: int_min_as = self.AS[self.AS[:, -1] == int_min_class] else: int_min_as = np.array([]) return int_min_as def _calculate_weak_majority_examples(self): """ Calculates weak majority examples and appends them to the RS set. """ for majority_class in self.majority_classes: majority_examples = self.DS[self.DS[:, -1] == majority_class] for x in majority_examples: if majority_class not in self._min_cost_classes(x, self.DS): self.RS = union(self.RS, np.array([x])) def _min_cost_classes(self, x, DS): """ Utility function that aims to identify minimum-cost classes, i.e. classes leading to the minimum cost after being (mis)classified as classes appearing in the neighborhood of x. :param x: Single observation :param DS: DS :return: List of classes associated with minimal cost of misclassification. """ C = self.minority_classes + self.intermediate_classes + self.majority_classes vals = [] kneighbors = self._knn(x, DS) for cj in C: s = 0 for ci in C: s += ( (kneighbors[:, -1] == ci).astype(int).sum() / self.k) * self.cost[C.index(ci), C.index(cj)] vals.append(s) C = np.array(C) vals = np.array(vals) vals = np.round(vals, 6) return C[vals == vals[np.argmin(vals)]] def _relabel_nn(self, x): """ Performs relabeling in the nearest neighborhood of x. :param x: An observation. 
""" nearest_neighbors = self._knn(x, self._ds_as_rs_union()) for neighbor in nearest_neighbors: if contains(self.RS, neighbor) and self._class_of( neighbor) in self.majority_classes and self._class_of( neighbor) in self._min_cost_classes( x, self._ds_as_rs_union()): self.RS = setdiff(self.RS, np.array([neighbor])) neighbor[-1] = x[-1] self.AS = union(self.AS, np.array([neighbor])) def _clean_nn(self, x): """ Performs cleaning in the nearest neighborhood of x. :param x: Single observation. """ nearest_neighbors = self._knn(x, self._ds_as_rs_union()) for neighbor in nearest_neighbors: if self._class_of(neighbor) in self.majority_classes and \ self._class_of(neighbor) in self._min_cost_classes(x, self._ds_as_rs_union()): self.DS = setdiff(self.DS, np.array([neighbor])) self.RS = setdiff(self.RS, np.array([neighbor])) def _knn(self, x, DS): """ Returns k nearest neighbors of x in DS that belong to c class if specified. :param x: Single observation :param DS: DS :param c: Class of neighbors that should be returned. :return: These neighbors from k nearest that belong to class c if specified. Otherwise all of them. """ DS = setdiff(DS, np.array([x])) if DS.shape[0] < self.k: self.neigh_clf = NearestNeighbors(n_neighbors=DS.shape[0]) else: self.neigh_clf = NearestNeighbors(n_neighbors=self.k) self.neigh_clf.fit(DS[:, :-1]) within_radius = self.neigh_clf.radius_neighbors( [x[:-1]], radius=self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1] + 0.0001 * self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1], return_distance=True) unique_distances = np.unique(sorted(within_radius[0][0])) all_distances = within_radius[0][0] all_indices = within_radius[1][0] indices = [] for dist in unique_distances: if len(indices) < self.k: indices += (all_indices[all_distances == dist]).tolist() return DS[indices] def _amplify_nn(self, x): """ Artificially amplifies example x by adding a copy of it to the AS. :param x: Single observation. """ while self._class_of(x) not in self._min_cost_classes( x, self._ds_as_rs_union()): y = x.copy() self.AS = union(self.AS, np.asarray([y])) @staticmethod def _class_of(example): return example[-1] def _ds_as_rs_union(self): return union(self.DS, union(self.AS, self.RS))