def knn_search(K, x, y=None, min_dist=None, index=None, algorithm=None,
               return_indices=False, dist_double=False, **kwargs):
    '''
    Calculates Euclidean distances to the first K closest elements of y for
    each x, which are row-instance data matrices. If y is None (and no index
    is passed), x is searched against itself.

    Returns a matrix whose (i, j)th element is the distance from the ith point
    in x to the (j+1)th nearest neighbor in y.

    If return_indices, returns a tuple (dist, idx), where idx is a matrix
    whose (i, j)th element is the identity of the (j+1)th nearest neighbor in
    y to the ith point in x. The indices are stored as uint16 when every index
    fits in that type; otherwise they keep the dtype returned by the index.

    FLANN returns squared Euclidean distances of the same type as the input
    data. If dist_double, if results are float32, cast them to float64 before
    square-rooting for more numerical accuracy.

    By default, clamps minimum distance to min(1e-2, 1e-100 ** (1/dim));
    setting min_dist to a number changes this value. Use 0 for no clamping.

    If index is passed, uses a preconstructed FLANN index for the elements of y
    (a FLANN() instance where build_index() has been run). Otherwise,
    constructs a temporary index here, using the passed algorithm. By default,
    uses a single k-d tree for data with dimension 5 or lower, and brute-force
    search in higher dimensions (which give exact results).

    Any other keyword arguments are also passed to the FLANN() object.

    Raises TypeError if x and y disagree on their second dimension, or if K is
    not a positive integer.
    '''
    # Unpacking also enforces that the inputs are two-dimensional.
    _, dim = x.shape
    if y is not None:
        _, dim2 = y.shape
        if dim != dim2:
            raise TypeError("x and y must have same second dimension")

    if not is_integer(K) or K < 1:
        raise TypeError("K must be a positive integer")

    if index is None:
        if algorithm is None:
            algorithm = pick_flann_algorithm(dim)
        index = FLANNIndex(algorithm=algorithm, **kwargs)
        # Index the reference set: y when it is given, otherwise x itself.
        index.build_index(x if y is None else y)

    idx, dist = index.nn_index(x, K)

    if return_indices:
        # uint16 keeps the index matrix compact, but casting larger values
        # would silently wrap around, so only narrow when it is lossless.
        if idx.size == 0 or idx.max() <= np.iinfo(np.uint16).max:
            idx = idx.astype(np.uint16)

    if dist_double:
        dist = np.asarray(dist, dtype=np.float64)

    # The index reports squared distances; take the root in place.
    np.sqrt(dist, out=dist)

    # protect against identical points
    if min_dist is None:
        min_dist = default_min_dist(dim)
    if min_dist > 0:
        np.maximum(min_dist, dist, out=dist)

    return (dist, idx) if return_indices else dist
def _build_indices(X, flann_args):
    """Build one FLANN index per bag in X.

    X is a sized collection of bags; each bag is handed to
    ``FLANNIndex.build_index``. ``flann_args`` is a mapping of keyword
    arguments forwarded to every ``FLANNIndex`` constructor call.

    Returns a list of the built indices, in the same order as the bags.
    """
    # TODO: should probably multithread this
    logger.info("Building indices...")

    built = [None] * len(X)
    for position, bag in enumerate(plog(X, name="index building")):
        flann_index = FLANNIndex(**flann_args)
        built[position] = flann_index
        flann_index.build_index(bag)
    return built
def pick_kmeans(x, n):
    """Pick the points of x that lie closest to n k-means cluster centers.

    Runs ``vl_kmeans`` on x with ``num_centers=n``, then does a
    one-nearest-neighbor FLANN query of those centers against x.

    Returns a tuple ``(picked, score)`` where ``picked`` is the first element
    of the FLANN ``nn`` result (presumably the indices into x of the nearest
    points -- confirm against cyflann) and ``score`` is
    ``N**2 - (N - n)**2`` with ``N = x.shape[0]``.
    """
    # NOTE: doesn't make sense to do this iteratively

    # run k-means clustering
    from vlfeat import vl_kmeans
    cluster_centers = vl_kmeans(x, num_centers=n)

    # pick points closest to the cluster centers
    from cyflann import FLANNIndex
    nn_result = FLANNIndex().nn(x, cluster_centers, num_neighbors=1)
    nearest = nn_result[0]

    num_points = x.shape[0]
    score = num_points**2 - (num_points - n)**2
    return nearest, score
def _make_index(bag):
    """Create a FLANN index configured from ``self.flann_args`` and build it on bag.

    NOTE(review): ``self`` is not a parameter; it is resolved from an
    enclosing scope that is not visible here, so this is expected to be
    defined inside a method.
    """
    flann_index = FLANNIndex(**self.flann_args)
    flann_index.build_index(bag)
    return flann_index