def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
    print("Running PCA...")
    train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
    ys = ys_from_pairs(train_pairs_pca)
    file_id = str(random.random())[2:]
    save_cvx_params(ys, file_id)
    run_cvx(file_id)
    M = load_cvx_result(file_id)
    dist = DistanceMetric.get_metric('mahalanobis', VI=M)
    train_a_sections = [x[0] for x in train_pairs_pca]
    train_b_sections = [x[1] for x in train_pairs_pca]
    test_a_sections = [x[0] for x in test_pairs_pca]
    test_b_sections = [x[1] for x in test_pairs_pca]
    train_given_sections = train_a_sections
    train_to_match_sections = train_b_sections
    test_given_sections = test_a_sections
    test_to_match_sections = test_b_sections
    if self.match_a_to_b:
        train_given_sections = train_b_sections
        train_to_match_sections = train_a_sections
        test_given_sections = test_b_sections
        test_to_match_sections = test_a_sections
    print("Constructing BallTrees...")
    train_bt = BallTree(train_to_match_sections, metric=dist)
    test_bt = BallTree(test_to_match_sections, metric=dist)
    train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction)
    test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction)
    print("Querying the BallTrees...")
    train_result = train_bt.query(train_given_sections, train_top_fraction)
    test_result = test_bt.query(test_given_sections, test_top_fraction)
    print("Looking at correctness of results...")
    train_correct = sum([int(i in train_result[1][i]) for i in range(len(train_given_sections))])
    test_correct = sum([int(i in test_result[1][i]) for i in range(len(test_given_sections))])
    print("Finding indices of correct matches...")
    test_result_full = test_bt.query(test_given_sections, len(test_given_sections))

    def default_index(lst, i):
        # Return the position of i in lst, or -1 if it is absent.
        ind = -1
        try:
            ind = lst.index(i)
        except ValueError:
            pass
        return ind

    test_indices = [default_index(list(test_result_full[1][i]), i)
                    for i in range(len(test_given_sections))]
    test_indices = [x for x in test_indices if x != -1]
    with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
        for i, index in enumerate(test_indices):
            if index == 0:
                successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')
    return [[train_correct, len(train_given_sections)],
            [test_correct, len(test_given_sections)]], test_indices
def _compute_nearest(xhs, rr, use_balltree=True, return_dists=False):
    """Find nearest neighbors.

    Note: The rows in xhs and rr must all be unit-length vectors, otherwise
    the result will be incorrect.

    Parameters
    ----------
    xhs : array, shape=(n_samples, n_dim)
        Points of data set.
    rr : array, shape=(n_query, n_dim)
        Points to find nearest neighbors for.
    use_balltree : bool
        Use fast BallTree based search from scikit-learn. If scikit-learn
        is not installed it will fall back to the slow brute force search.
    return_dists : bool
        If True, return associated distances.

    Returns
    -------
    nearest : array, shape=(n_query,)
        Index of nearest neighbor in xhs for every point in rr.
    distances : array, shape=(n_query,)
        The distances. Only returned if return_dists is True.
    """
    if use_balltree:
        try:
            from sklearn.neighbors import BallTree
        except ImportError:
            logger.info('Nearest-neighbor searches will be significantly '
                        'faster if scikit-learn is installed.')
            use_balltree = False

    if xhs.size == 0 or rr.size == 0:
        if return_dists:
            return np.array([], int), np.array([])
        return np.array([], int)

    if use_balltree is True:
        ball_tree = BallTree(xhs)
        if return_dists:
            out = ball_tree.query(rr, k=1, return_distance=True)
            return out[1][:, 0], out[0][:, 0]
        else:
            nearest = ball_tree.query(rr, k=1, return_distance=False)[:, 0]
            return nearest
    else:
        from scipy.spatial.distance import cdist
        if return_dists:
            nearest = list()
            dists = list()
            for r in rr:
                d = cdist(r[np.newaxis, :], xhs)
                idx = np.argmin(d)
                nearest.append(idx)
                dists.append(d[0, idx])
            return (np.array(nearest), np.array(dists))
        else:
            nearest = np.array([np.argmin(cdist(r[np.newaxis, :], xhs))
                                for r in rr])
            return nearest
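# A minimal usage sketch for _compute_nearest (an assumption, not part of the
# original module): it presumes numpy is imported as np and a module-level
# logger exists, as the function itself does. Rows are normalized to unit
# length, which the docstring requires for correct results.
import numpy as np

xhs = np.random.randn(200, 3)
xhs /= np.linalg.norm(xhs, axis=1, keepdims=True)  # unit-length rows
rr = np.random.randn(5, 3)
rr /= np.linalg.norm(rr, axis=1, keepdims=True)
nearest, dists = _compute_nearest(xhs, rr, return_dists=True)
print(nearest.shape, dists.shape)  # (5,), (5,)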
def knn(a, b):
    """k nearest neighbors (k is assumed to be a module-level constant)."""
    b = np.array([bb[:-1] for bb in b])
    tree = BallTree(b)
    # BallTree.query expects a 2D array of query points, so wrap the single query.
    __, indx = tree.query([a[:-1]], k=k)
    return [b[i] for i in indx[0]]
def compute_labels(X, C):
    """Compute the cluster labels for dataset X given centers C."""
    # labels = np.argmin(pairwise_distances(C, X), axis=0)  # THIS REQUIRES TOO MUCH MEMORY FOR LARGE X
    tree = BallTree(C)
    labels = tree.query(X, k=1, return_distance=False).squeeze()
    return labels
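# A short sketch of compute_labels in action (toy data, an assumption; it
# presumes numpy as np and sklearn's BallTree are imported as in the
# function above).
import numpy as np

X = np.random.rand(1000, 2)                  # dataset
C = np.array([[0.25, 0.25], [0.75, 0.75]])   # two cluster centers
labels = compute_labels(X, C)
print(labels.shape)  # (1000,): index of the nearest center per sample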
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0, metric='minkowski',
                            p=2, leaf_size=40, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size)
    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=min_samples,
                                dualtree=True, breadth_first=True)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    single_linkage_tree = label(min_spanning_tree)
    return single_linkage_tree, None
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0, metric='minkowski',
                            p=2, leaf_size=40, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size)
    dist_metric = DistanceMetric.get_metric(metric)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True, breadth_first=True)[0][:, -1]
    # Mutual reachability distance is implicit in mst_linkage_core_cdist
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)
    return single_linkage_tree, None
def DualTree(dataFlux, dDataFlux, modelFlux, modelParams, mcIts):
    """
    Inputs:
        dataFlux = observed fluxes, array of size (#objects, #filters)
        dDataFlux = flux uncertainties, array of size (#objects, #filters)
        modelFlux = fluxes of models, array of size (#models, #filters)
        modelParams = parameters of each model to be recorded, array of size
            (#models, #parameters)
        mcIts = number of times to perturb fluxes for each object, int

    Output:
        NumPy array of size (#objects, mcIts, #params); e.g. the zeroth element
        gives you a 2d array where each row represents the fit parameters from
        one monte carlo iteration
    """
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts, len(dataFlux[i]))
        newColors = newFlux[:, 1:] / newFlux[:, :-1]
        query = tree.query(newColors, k=1, dualtree=True)
        s = Scale(modelFlux[query[1][:, 0]], newFlux, np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            myParams = np.c_[myParams, modelParams[query[1][:, 0]][:, j]]
        fitParams.append(myParams)
    return np.array(fitParams)
class BallTreeRecommender(object):
    """
    Given input terms, provide k recipe recommendations
    """
    def __init__(self, k=3, **kwargs):
        self.k = k
        self.trans_path = "svd.pkl"
        self.tree_path = "tree.pkl"
        self.transformer = False
        self.tree = None
        self.load()

    def load(self):
        """
        Load a pickled transformer and tree from disk, if they exist.
        """
        if os.path.exists(self.trans_path):
            self.transformer = joblib.load(open(self.trans_path, 'rb'))
            self.tree = joblib.load(open(self.tree_path, 'rb'))
        else:
            self.transformer = False
            self.tree = None

    def save(self):
        """
        It takes a long time to fit, so just do it once!
        """
        joblib.dump(self.transformer, open(self.trans_path, 'wb'))
        joblib.dump(self.tree, open(self.tree_path, 'wb'))

    def fit_transform(self, documents):
        # Transformer will be False if the pipeline hasn't been fit yet;
        # trigger fit_transform and save the transformer and lexicon.
        if self.transformer is False:
            self.transformer = Pipeline([
                ('norm', TextNormalizer(minimum=50, maximum=200)),
                ('transform', Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    ('svd', TruncatedSVD(n_components=200))
                ]))
            ])
            self.lexicon = self.transformer.fit_transform(documents)
            self.tree = BallTree(self.lexicon)
            self.save()

    def query(self, terms):
        """
        Given input list of ingredient terms,
        return the k closest matching recipes.

        :param terms: list of strings
        :return: list of document indices of documents
        """
        # Use transform, not fit_transform: the pipeline is already fitted,
        # and refitting on the query would invalidate the tree's vector space.
        vect_doc = self.transformer.named_steps['transform'].transform(
            wordpunct_tokenize(terms)
        )
        dists, inds = self.tree.query(vect_doc, k=self.k)
        return inds[0]
def correrPruebaLocal(set_ampliado):
    print("Running local test")
    train, targetTrain, test, targetTest = cargarDatosPruebaLocal(set_ampliado, 0.66)
    tree = BallTree(train, leaf_size=30)
    predictions = []
    correctas = 0
    incorrectas = 0
    for x in range(len(test)):
        # BallTree.query expects a 2D array, so wrap the single sample.
        dist, ind = tree.query([test[x]], k=4)
        resultado = obtenerPrediccionknnEB(train, targetTrain, test[x], ind.ravel())
        predictions.append(resultado)
        print(progreso(x, len(test)))
        if resultado == targetTest[x]:
            correctas += 1
        else:
            incorrectas += 1
        print("Predictions --> Correct: " + str(correctas) +
              " Incorrect: " + str(incorrectas) + " Total: " + str(len(test)))
        print('> predicted=' + repr(resultado) + ', actual=' + repr(targetTest[x]) +
              ' ' + progreso(x, len(test)))
    print("overall accuracy")
    correct = 0
    for x in range(len(test)):
        if targetTest[x] == predictions[x]:
            correct += 1
    print((float(correct) / float(len(test))) * 100.0)
def DualTree(dataFlux, dDataFlux, modelFlux, modelParams, mcIts, columnsToScale=[]):
    '''
    Inputs:
        dataFlux = observed fluxes, array of size (#objects, #filters)
        dDataFlux = flux uncertainties, array of size (#objects, #filters)
        modelFlux = fluxes of models, array of size (#models, #filters)
        modelParams = parameters of each model to be recorded, array of size
            (#models, #parameters)
        mcIts = number of times to perturb fluxes for each object, int
        columnsToScale = list of column indices in modelParams of parameters
            that need to be multiplied by the scale factor

    Output:
        NumPy array of size (#objects, mcIts, #params); e.g. the zeroth element
        gives you a 2d array where each row represents the fit parameters from
        one monte carlo iteration
    '''
    modelColors = modelFlux[:, 1:] / modelFlux[:, :-1]
    tree = BallTree(modelColors)
    fitParams = []
    for i in range(len(dataFlux)):
        newFlux = dataFlux[i] + dDataFlux[i] * np.random.randn(mcIts, len(dataFlux[i]))
        newColors = newFlux[:, 1:] / newFlux[:, :-1]
        query = tree.query(newColors, k=1, dualtree=True)
        s = fit_tools.Scale(modelFlux[query[1][:, 0]], newFlux, np.ones(np.shape(newFlux)))
        myParams = s
        for j in range(len(modelParams[0])):
            if j in columnsToScale:
                myParams = np.c_[myParams, np.multiply(s, modelParams[query[1][:, 0]][:, j])]
            else:
                myParams = np.c_[myParams, modelParams[query[1][:, 0]][:, j]]
        fitParams.append(myParams)
    return np.array(fitParams)
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features)
        distances = distances.astype(np.float32)
        distances = distances.dot(distances.T)
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, False)
        kl, gradex = _kl_divergence(params, P, degrees_of_freedom,
                                    n_samples, n_components)

        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        neighbors_nn = neighbors_nn[:, 1:]
        Pbh = _joint_probabilities_nn(distances, neighbors_nn,
                                      perplexity, False)
        kl, gradbh = _kl_divergence_bh(params, Pbh, neighbors_nn,
                                       degrees_of_freedom, n_samples,
                                       n_components, angle=angle,
                                       skip_num_points=0, verbose=False)
        assert_array_almost_equal(Pbh, P, decimal=5)
        assert_array_almost_equal(gradex, gradbh, decimal=5)
def md_nearest_from_centroids(seeding, centroids):
    # mean distance from each centroid to its nearest seeding point
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    sum_dist = sum(d[0] for d in dist)
    mean = sum_dist / len(centroids)
    return mean
class BallTreeANN:
    def __init__(self):
        """
        Constructor
        """
        self.nbrs = None

    def build_index(self, dataset, leaf_size):
        self.nbrs = BallTree(dataset, leaf_size=leaf_size, metric="euclidean")
        return self.nbrs

    def build_store_index(self, dataset, path, leaf_size):
        self.build_index(dataset, leaf_size)
        self.store_index(path)

    def store_index(self, path):
        with open(path, "wb") as output1:
            pickle.dump(self.nbrs, output1, pickle.HIGHEST_PROTOCOL)

    def load_index(self, path):
        with open(path, "rb") as input1:
            self.nbrs = pickle.load(input1)

    def search_in_radious(self, vector, radious=2):
        # Note: unlike query, BallTree.query_radius returns (indices, distances).
        indices, distances = self.nbrs.query_radius(vector, r=radious,
                                                    return_distance=True)
        return distances, indices

    def search_neighbors(self, vector, num_neighbors):
        distances, indices = self.nbrs.query(vector, k=num_neighbors)
        return distances, indices
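# A usage sketch for BallTreeANN (toy data, an assumption). Note that both
# query and query_radius expect 2D arrays, so a single query vector is
# passed with shape (1, n_features).
import numpy as np

ann = BallTreeANN()
ann.build_index(np.random.rand(100, 8), leaf_size=30)
dists, inds = ann.search_neighbors(np.random.rand(1, 8), num_neighbors=5)
print(inds[0])  # indices of the 5 nearest dataset rows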
def _rsl_prims_balltree(X, cut, k=5, alpha=1.4142135623730951, gamma=5,
                        metric='minkowski', p=2):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric)
    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric)
    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)
    labels = single_linkage_tree.get_clusters(cut, gamma)
    return labels, single_linkage_tree
def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0, metric='minkowski',
                            p=2, leaf_size=40, gen_min_span_tree=False, **kwargs):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples, dualtree=True,
                                breadth_first=True)[0][:, -1].copy(order='C')
    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)
    return single_linkage_tree, None
class ColorSchemeTransformer(object):
    def __init__(self, color_palette):
        self.color_mapper = BallTree(color_palette)

    def nearest_neighbors(self, image):
        flat_image = image.reshape((image.shape[0] * image.shape[1], 3))
        matched_colors = self.color_mapper.query(flat_image)[1]
        return matched_colors.reshape((image.shape[0], image.shape[1]))
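# A usage sketch for ColorSchemeTransformer (a tiny 3-color palette and a
# random RGB image; both are assumptions for illustration, with numpy as np
# and sklearn's BallTree imported as above).
import numpy as np

palette = np.array([[0, 0, 0], [128, 128, 128], [255, 255, 255]])
image = np.random.randint(0, 256, size=(4, 6, 3))
indices = ColorSchemeTransformer(palette).nearest_neighbors(image)
print(indices.shape)  # (4, 6): nearest palette index for each pixel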
class HammingBallTree(HammingSearchBase):
    def __init__(self, leaf_size=40, query_kwds=None):
        self.leaf_size = leaf_size
        self.query_kwds = query_kwds or {}

    def fit(self, X):
        X = self._validate_input(X, return_compact=False)
        self._tree = BallTree(X, metric='hamming', leaf_size=self.leaf_size)
        return self

    def query(self, X, k, return_dist=False):
        X = self._validate_input(X, return_compact=False)
        if return_dist:
            dist, ind = self._tree.query(X, k, return_distance=True)
            # sklearn's hamming distance is normalized; rescale to bit counts.
            return ind, (dist * X.shape[1]).astype(int)
        else:
            return self._tree.query(X, k, return_distance=False)
def rmsd_nearest_from_centroids(seeding, centroids):
    # root mean squared distance from each centroid to its closest seeding
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    # root mean squared distance
    sum_sqdist = sum(d[0] ** 2 for d in dist)
    mean = sum_sqdist / len(centroids)
    return mean ** 0.5
def predict(self, X):
    # sklearn's BallTree takes the data in its constructor; it has no fit method.
    ball_tree = BallTree(self.cluster_centers_)
    _, indexes = ball_tree.query(X)
    result = []
    for idx, in indexes:
        result.append(self.labels_[idx])
    return result
def md_weighted_nearest_from_centroids(seeding, centroids, weights):
    assert len(centroids) == len(weights)
    sum_weight = sum(weights)
    ball_tree = BallTree(seeding)
    dist, idx = ball_tree.query(centroids)
    sum_weighted_dist = sum(d[0] * weight for d, weight in zip(dist, weights))
    mean = sum_weighted_dist / sum_weight
    return mean
def get_centroid_weights(X, centroids):
    assert isinstance(X, np.ndarray)
    assert isinstance(centroids, np.ndarray)
    ball_tree = BallTree(centroids)
    dist, indexes = ball_tree.query(X)
    weights = [0 for i in centroids]
    # query returns one index array per row; unpack it before counting.
    for idx, in indexes:
        weights[idx] += 1
    return weights
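# A quick sketch tying the centroid helpers together (toy arrays, an
# assumption; numpy as np presumed). Each weight counts how many points of
# X fall in a centroid's Voronoi cell.
import numpy as np

X = np.random.rand(500, 2)
centroids = np.random.rand(10, 2)
weights = get_centroid_weights(X, centroids)
print(sum(weights))  # 500: every point is assigned to exactly one centroid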
def get_graph_topo(halos):
    x, y, z = cosmology.spherical_to_cartesian_with_redshift(halos['ra'],
                                                             halos['dec'],
                                                             halos['z'])
    box_coords = np.concatenate([x, y, z], axis=1)
    BT = BallTree(box_coords, leaf_size=5)
    list_conn = []
    for ih, vh in enumerate(halos):
        n_connections = 70
        # BallTree.query expects a 2D array of query points.
        bt_dx, bt_id = BT.query(box_coords[ih, :].reshape(1, -1), k=n_connections)
        for ic, vc in enumerate(halos[bt_id]):
            pass
def correrPruebaParaKaggle(set_ampliado):
    print("Running tests for Kaggle")
    train, targetTrain, test = cargarDatosParaKaggle(set_ampliado)
    tree = BallTree(train, leaf_size=30)
    predictions = []
    for x in range(len(test)):
        # BallTree.query expects a 2D array, so wrap the single sample.
        dist, ind = tree.query([test[x]], k=4)
        resultado = obtenerPrediccionknnEB(train, targetTrain, test[x], ind.ravel())
        predictions.append(resultado)
        print(progreso(x, len(test)))
    guardarPrediccionesParaKaggle(predictions)
def build_knn_matrix(data_matrix):
    # voxel_num and K_NN are assumed to be module-level constants.
    neighbours_matrix = np.zeros((voxel_num, K_NN - 1))
    tree = BallTree(data_matrix[:, 0:3])
    for voxel in range(voxel_num):
        # query expects a 2D array; index 0 is the query point itself, so skip it.
        dist, ind = tree.query(data_matrix[voxel, 0:3].reshape(1, -1), k=K_NN)
        neighbours_matrix[voxel, :] = ind[0, 1:]
    # Keep only mutual neighbours; mark one-sided links with -1.
    for cur_voxel in range(voxel_num):
        neighbours = neighbours_matrix[cur_voxel, :]
        for ind in range(len(neighbours)):
            neighbour = int(neighbours[ind])
            if cur_voxel not in neighbours_matrix[neighbour, :]:
                neighbours_matrix[cur_voxel, ind] = -1
    return neighbours_matrix
def get_evidence_grid(points, res_pts, intr_prms, exact=False):
    """
    Associate the "z-axis" value (evidence, overlap, etc...) res_pts with its
    corresponding point in the template bank (points). If exact is True, then
    the point must exactly match the point in the bank.
    """
    grid_tree = BallTree(points)
    grid_idx = []
    # Reorder the grid points to match their weight indices
    for res in res_pts:
        # query expects a 2D array of points.
        dist, idx = grid_tree.query(numpy.atleast_2d(res), k=1)
        # Stupid floating point inexactitude...
        #print res, points[idx[0][0]]
        #assert numpy.allclose(res, points[idx[0][0]])
        grid_idx.append(idx[0][0])
    return points[grid_idx]
def calc_vert_vals(verts, pts, vals, method='max', k_points=100):
    ball_tree = BallTree(pts)
    dists, pts_inds = ball_tree.query(verts, k=k_points, return_distance=True)
    near_vals = vals[pts_inds]
    # sig_dists = dists[np.where(abs(near_vals) > 2)]
    cover = len(np.unique(pts_inds.ravel())) / float(len(pts))
    print('{}% of the points are covered'.format(cover * 100))
    if method == 'dist':
        n_dists = 1 / (dists ** 2)
        norm = 1 / np.sum(n_dists, 1)
        norm = np.reshape(norm, (len(norm), 1))
        n_dists = norm * n_dists
        verts_vals = np.sum(near_vals * n_dists, 1)
    elif method == 'max':
        verts_vals = near_vals[range(near_vals.shape[0]),
                               np.argmax(abs(near_vals), 1)]
    return verts_vals
def voronoid_filling(seeding, centroids, weights):
    assert len(centroids) == len(weights)
    ball_tree = BallTree(centroids)
    _, indexes = ball_tree.query(seeding)
    filled_centroids = set()
    sum_weights = sum(weights)
    badness = sum_weights
    for idx, in indexes:
        if idx in filled_centroids:
            continue
        filled_centroids.add(idx)
        badness -= weights[idx]
    return badness / sum_weights
def cvt_bins(rkpc, zkpc, flux, fluxerror, bin_ids, niters=4, bad_bin_id=None):
    combrzarr = (np.vstack((rkpc, zkpc))).T
    weights = (flux / fluxerror) ** 2
    weighted_rkpc = rkpc * weights
    weighted_zkpc = zkpc * weights
    old_rcentroids = None
    old_zcentroids = None
    unique_bins = np.unique(bin_ids)
    if bad_bin_id is not None:
        unique_bins = unique_bins[unique_bins != bad_bin_id]
    max_distance = -1
    for i in range(niters):
        start = time.time()
        # Compute centroids:
        meanweights = scipy.ndimage.mean(weights, labels=bin_ids, index=unique_bins)
        numperbin = scipy.ndimage.sum(np.ones(len(weights)), labels=bin_ids, index=unique_bins)
        mean_weighted_rkpc = scipy.ndimage.mean(weighted_rkpc, labels=bin_ids, index=unique_bins)
        mean_weighted_zkpc = scipy.ndimage.mean(weighted_zkpc, labels=bin_ids, index=unique_bins)
        # print(numperbin.min(), numperbin.max())
        # np.savetxt('test_weights.txt', zip(meanweights, numperbin))
        # print("Debug: ", meanweights.min(), meanweights.max())
        rcentroids = (mean_weighted_rkpc / meanweights)[numperbin > 0]
        zcentroids = (mean_weighted_zkpc / meanweights)[numperbin > 0]
        # Compute which bin each pixel belongs in:
        combrzarr_centroids = (np.vstack((rcentroids, zcentroids))).T
        tree = BallTree(combrzarr_centroids, leaf_size=10, metric='euclidean')
        dists, indices = tree.query(combrzarr, k=1)
        indices = indices[:, 0]
        bin_ids = unique_bins[indices]
        print("CVT iteration {0:d}: Number of bins = {1:d}, Time = {2:.2f}s".format(
            i, len(rcentroids), time.time() - start))
        # if old_rcentroids is not None:
        #     old_rcentroids = old_rcentroids[numperbin > 0]
        #     old_zcentroids = old_zcentroids[numperbin > 0]
        #     distances = np.sqrt((rcentroids - old_rcentroids)**2 + (zcentroids - old_zcentroids)**2)
        #     max_distance = distances.max()
        #     print("CVT iteration {0:d}: Number of bins = {1:d}, Max Centroid Change = {2:.3e} kpc, Time = {3:.2f}s".format(i, len(rcentroids), max_distance, time.time() - start))
        # old_rcentroids = rcentroids
        # old_zcentroids = zcentroids
    return bin_ids, max_distance
def fix_bins(rkpc, zkpc, bin_ids):
    # Get the centroids of all the successful bins:
    good_bins = np.unique(bin_ids[bin_ids >= 0])
    bad_rkpc = rkpc[bin_ids < 0]
    bad_zkpc = zkpc[bin_ids < 0]
    comb_badrzarr = (np.vstack((bad_rkpc, bad_zkpc))).T
    rcentroid = scipy.ndimage.mean(rkpc, labels=bin_ids, index=good_bins)
    zcentroid = scipy.ndimage.mean(zkpc, labels=bin_ids, index=good_bins)
    # Ball-tree the centroids:
    combrzarr = (np.vstack((rcentroid, zcentroid))).T
    tree = BallTree(combrzarr, leaf_size=5, metric='euclidean')
    # Query all the bad pixels for their nearest centroid:
    dists, indices = tree.query(comb_badrzarr, k=1)
    indices = indices[:, 0]
    bin_ids[bin_ids < 0] = good_bins[indices]
    return bin_ids
def get_nearest(src_points, candidates, k_neighbors=1):
    # https://autogis-site.readthedocs.io/en/latest/notebooks/L3/06_nearest-neighbor-faster.html
    """Find nearest neighbors for all source points from a set of candidate points"""
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric='haversine')
    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)
    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()
    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    closest = indices[0]
    closest_dist = distances[0]
    # Return indices and distances
    return (closest, closest_dist)
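# A usage sketch for get_nearest (the coordinates are illustrative
# assumptions): the haversine metric expects (lat, lon) in radians, so
# degree coordinates must be converted before querying, and the returned
# distance is in radians until multiplied by the Earth's radius.
import numpy as np

src_deg = np.array([[60.17, 24.94]])                   # Helsinki (lat, lon), degrees
cand_deg = np.array([[59.33, 18.07], [55.68, 12.57]])  # Stockholm, Copenhagen
closest, closest_dist = get_nearest(np.radians(src_deg), np.radians(cand_deg))
print(closest[0], closest_dist[0] * 6371)  # nearest index and distance in km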
def do_novelty_generation(num_cpu=multiprocessing.cpu_count(), min_distance=1.):
    gateway.nextGeneration()
    chromosomes = gateway.listNewChromosomes()
    machines = [gateway.executeProgram(c.getProgram()) for c in chromosomes]
    _floats = [list(m.floats().toList()) for m in machines]
    behaviors = Parallel(n_jobs=num_cpu)(delayed(evaluate_behavior)(_f)
                                         for _f in _floats)
    ball = BallTree(behaviors_archive, metric='euclidean')
    _indexes = []
    for i, candidate in enumerate(behaviors):
        dist, _ = ball.query([candidate], k=1)
        if dist[0][0] > min_distance:
            behaviors_archive.append(candidate.tolist())
            chromosomes[i].setFitness(dist[0][0])
            _indexes.append(i)
            score = evaluate_fitness(_floats[i])
            if len(history) < 1 or history[-1][0] < score:
                history.append((score, chromosomes[i].toCodeString()))
                print(history[-1])
    print("Added {} new behaviors.".format(len(_indexes)))
def get_nearest_neighbors(src_vectors, tgt_vectors, num_neighbors=5):
    if not BallTree:
        raise ImportError('The scikit-learn package must be installed')
    src_latlng = np.radians(np.array(src_vectors))
    tgt_latlng = np.radians(np.array(tgt_vectors))
    tree = BallTree(tgt_latlng, metric='haversine')
    di_tuple = tree.query(src_latlng, k=num_neighbors)
    ix_dist_array = []
    for i in range(len(src_vectors)):
        nn_index_and_dist = []
        for j in range(num_neighbors):
            index = di_tuple[1][i][j]
            distance = di_tuple[0][i][j] * 6367 * 1000  # earth radius, in meters
            nn_index_and_dist.append((index, distance))
        ix_dist_array.append(nn_index_and_dist)
    return ix_dist_array
def get_color_and_labels(
        original_vertices: np.ndarray,
        representative_vertices: np.ndarray) -> List[np.ndarray]:
    """Find nearest neighbor in Euclidean space to interpolate color and label
    information to vertices in simplified mesh.

    Arguments:
        original_vertices {np.ndarray} -- vertex positions in original mesh
        representative_vertices {np.ndarray} -- vertex positions in simplified mesh

    Returns:
        List[np.ndarray] -- list of arrays containing RGB color and label information
    """
    ball_tree = BallTree(original_vertices[:, :3])
    return_colors_labels = []
    for coords in representative_vertices:
        # query expects a 2D array of points, so reshape the single vertex.
        _, ind = ball_tree.query(coords.reshape(1, -1), k=1)
        return_colors_labels.append(original_vertices[ind.flatten()][:, 3:])
    return return_colors_labels
class NeighborSampler(BaseEstimator):
    def __init__(self, k=8, temperature=1.2):
        self.k = k
        self.temperature = temperature

    def fit(self, X, y):
        self.tree_ = BallTree(X)
        self.y_ = np.array(y)

    def predict(self, X, random_state=None):
        distances, indices = self.tree_.query(X, return_distance=True, k=self.k)
        result = []
        dist = []
        for distance, index in zip(distances, indices):
            # Weight neighbors by softmax of negative distance so that
            # closer neighbors are sampled more often.
            result.append(
                np.random.choice(index, p=softmax(-distance * self.temperature)))
            dist.append(distance)
        return self.y_[result], dist
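# A usage sketch for NeighborSampler (toy data, an assumption). softmax is
# presumed to be scipy.special.softmax, which the class body relies on.
import numpy as np
from scipy.special import softmax  # dependency of the class above

sampler = NeighborSampler(k=3)
sampler.fit(np.random.rand(50, 2), np.arange(50))
labels, dists = sampler.predict(np.random.rand(4, 2))
print(labels)  # one sampled neighbor label per query row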
def make_nearest_surf(center, radius, rotation, contour_pts,
                      psize=20, qsize=8, vis=False, seg=None):
    # Sample points on an ellipsoid with the given radii.
    points = np.array([[radius[0] * math.cos(u) * math.cos(v),
                        radius[1] * math.cos(v) * math.sin(u),
                        radius[2] * math.sin(v)]
                       for u in np.linspace(0, 2 * math.pi, num=psize)
                       for v in np.linspace(-math.pi / 2 + 0.01,
                                            math.pi / 2 - 0.01, num=psize)])
    for i in range(len(points)):
        points[i] = np.dot(points[i], rotation)
    points += center
    tree = BallTree(contour_pts)
    _, ind = tree.query(points, k=1)
    ind = np.reshape(ind, (ind.shape[0]))
    points = contour_pts[ind, :].astype(np.float64)
    noise = 0.001
    points += np.random.rand(points.shape[0], points.shape[1]) * noise
    if vis:
        img_mask = get_image_mask_points(seg, points)
        color_img = draw_segmentation(seg, img_mask, mark_val=255)
        show_ct_image(color_img)
    return approximate_surface(points.tolist(), psize, psize, 3, 3,
                               ctrlpts_size_u=qsize, ctrlpts_size_v=qsize)
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951,
                        metric='euclidean', **kwargs):
    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric, **kwargs)
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
    min_spanning_tree = mst_linkage_core_vector(X, core_distances,
                                                dist_metric, alpha)
    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)
    return single_linkage_tree
def train(self):
    R = (self.m ** 2 + self.n ** 2) ** 0.5
    for it in range(self.max_iter):
        for sample in self.data:
            win_m, win_n, max_sim = self.find_winner(sample)
            neighbor = self.find_neighbor(win_m, win_n, R)
            for w in neighbor:
                mw = w[0]
                nw = w[1]
                rw = w[2]
                self.weights[mw, nw] += self.learning_rate(it, rw) * \
                    (sample - self.weights[mw, nw])
        R *= 1 - (it + 1) / self.max_iter
    # Label each map unit by majority vote of its 10 nearest data points.
    data_tree = BallTree(self.data)
    for mi in range(self.m):
        for ni in range(self.n):
            dist, idx = data_tree.query([self.weights[mi, ni]], k=10)
            vote = [self.labels[i] for i in idx.reshape(-1)]
            self.output[mi, ni] = int(sorted(dict(Counter(vote)).items(),
                                             key=lambda d: d[1],
                                             reverse=True)[0][0])
def test_index():
    xs = rand(1000, 100, random_state=42).toarray()
    try:
        indexer = SQLiteIndexer(index_path=INDEX_PATH)
        index = PrioritizedDynamicContinuousIndex(indexer,
                                                  composite_indices=2,
                                                  simple_indices=50)
        index.fit(xs)
        x = xs[0:1]
        k = 10
        nn_baseline = BallTree(xs)
        baseline_dist, baseline_idx = nn_baseline.query(x, k=k)
        dist, idx = index.query(x, k=k)
        # np.testing.assert_equal(baseline_idx[0], idx)
    finally:
        if os.path.exists(INDEX_PATH):
            os.remove(INDEX_PATH)
def get_nearest(src_points, candidates, k_neighbors=1):
    """Find nearest neighbors for all source points from a set of candidate points"""
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric='haversine')
    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)
    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()
    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    # closest = indices[0]
    # closest_dist = distances[0]
    closest = indices
    closest_dist = distances
    # Return indices and distances
    return (closest, closest_dist)
def nearestNeighbors():
    print('Nearest Neighbors')
    start_time = time.time()
    reader = csv.DictReader(open(data_dir + 'train-tuned-disc.csv'))
    fieldNames = reader.fieldnames
    allData = []
    testReader = csv.DictReader(open(data_dir + 'ttd-1.csv'))
    testFieldNames = testReader.fieldnames
    testData = []
    for row in reader:
        rowData = []
        for field in testFieldNames:
            if field in knn_disabled:
                continue
            else:
                rowData.append(row.get(field))
        allData.append(rowData)
    for row in testReader:
        rowData = []
        for field in testFieldNames:
            if field in knn_disabled:
                continue
            else:
                rowData.append(row.get(field))
        testData.append(rowData)
    # CSV fields are read as strings; convert to floats for the euclidean
    # metric (assumes the remaining columns are numeric).
    X = numpy.array(allData, dtype=float)
    Y = numpy.array(testData, dtype=float)
    nn = BallTree(X, leaf_size=30, metric='euclidean')
    result = nn.query(Y, k=50, return_distance=False)
    print(result)
    print('Finished discretization in {0} s'.format(time.time() - start_time))
def get_nearest(src_points, candidates, k_neighbors=1):
    # get_nearest and nearest_neighbor functions sourced from the following site:
    # https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html
    """Find nearest neighbors for all source points from a set of candidate points"""
    print('balltree get nearest function - hsl')
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric='haversine')
    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)
    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()
    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    closest = indices[0]
    closest_dist = distances[0]
    # Return indices and distances
    return (closest, closest_dist)
def calc_nearest_site():
    # Use sklearn's BallTree to find the nearest port for each center.
    ports_wpi = get_sites(engine)  # load the ports before building the tree
    points_of_int = np.radians(
        df_centers.loc[:, ['average_lat', 'average_lon']].values)
    candidates = np.radians(ports_wpi.loc[:, ['lat', 'lon']].values)
    tree = BallTree(candidates, leaf_size=30, metric='haversine')
    nearest_list = []
    for i in range(len(points_of_int)):
        dist, ind = tree.query(points_of_int[i, :].reshape(1, -1), k=1)
        nearest_dict = {
            clust_id_value: df_centers.iloc[i].loc[clust_id_value],
            'nearest_site_id': ports_wpi.iloc[ind[0][0]].loc['port_id'],
            'nearest_port_dist': dist[0][0] * 6371.0088  # radians -> km
        }
        nearest_list.append(nearest_dict)
    df_nearest = pd.DataFrame(nearest_list)
    return pd.merge(df_centers, df_nearest, how='left', on=clust_id_value)
def nne(dim_red, true_labels):
    """
    Calculates the nearest neighbor accuracy (basically leave-one-out cross
    validation with a 1NN classifier).

    Args:
        dim_red (array): dimensions (k, cells)
        true_labels (array): 1d array of integers

    Returns:
        Nearest neighbor accuracy - fraction of points for which the 1NN
        classifier returns the correct value.
    """
    # use sklearn's BallTree
    bt = BallTree(dim_red.T)
    correct = 0
    for i, l in enumerate(true_labels):
        dist, ind = bt.query([dim_red[:, i]], k=2)
        closest_cell = ind[0, 1]
        if true_labels[closest_cell] == l:
            correct += 1
    return float(correct) / len(true_labels)
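# A usage sketch for nne (random data, an assumption, so accuracy should
# hover near chance). Note dim_red is (k, cells): features in rows, cells
# in columns.
import numpy as np

dim_red = np.random.rand(10, 300)            # 10 dimensions, 300 cells
true_labels = np.random.randint(0, 3, 300)   # 3 classes
print(nne(dim_red, true_labels))             # roughly 0.33 for random labels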
class PPMD:
    def __init__(self, features, labels, k=5):
        self.features = features
        self._kdtree = BallTree(features)
        self._y = labels
        self._k = k

    def majority(self, label_indices):
        assert len(label_indices) == self._k, "Did not get k inputs"
        # Get the labels of the k nearest neighbors
        knn_labels = []
        for label_index in label_indices:
            knn_labels.append(self._y[label_index])
        # Return the element-wise median of the neighbor labels
        median_vector = []
        for i in range(len(knn_labels[0])):
            median_vector.append(np.median([row[i] for row in knn_labels]))
        return median_vector

    def classify(self, feature_set):
        # ind = self._kdtree.query_radius(feature_set.reshape(1, -1), r=1)
        dist, ind = self._kdtree.query(feature_set.reshape(1, -1), k=self._k)
        return self.majority(ind[0])

    def error(self, prediction, truth, features):
        assert len(prediction) == len(truth), \
            "Number of predictions must equal the number of real data points."
        error = np.abs(np.array(prediction) - np.array(truth))
        random_walk = np.abs(np.diff(features, axis=0)).sum()
        return np.array(error / ((len(prediction) / (len(features) - 1.0)) *
                                 random_walk)).sum()
def image_retrieval():
    topK = 10
    avg_acc = 0
    x_train_noisy, x_test_noisy, y_train, y_test, x_train, x_test = preprocess()
    autoencoder = load_model('../working/autoencoder.h5')
    print(autoencoder.summary())
    encoder = Model(autoencoder.input,
                    autoencoder.get_layer('encoding_layer').output)
    coded_train = encoder.predict(x_train_noisy)
    coded_train = coded_train.reshape(
        coded_train.shape[0],
        coded_train.shape[1] * coded_train.shape[2] * coded_train.shape[3])
    coded_train = preprocessing.normalize(coded_train, norm='l2')
    tree = BallTree(coded_train, leaf_size=200)
    # extracting features from test set
    coded_test = encoder.predict(x_test_noisy)
    coded_test = coded_test.reshape(
        coded_test.shape[0],
        coded_test.shape[1] * coded_test.shape[2] * coded_test.shape[3])
    coded_test = preprocessing.normalize(coded_test, norm='l2')
    for i in range(coded_test.shape[0]):
        query_code = coded_test[i]
        query_label = y_test[i]
        dists, ids = tree.query([query_code], k=topK)
        labels = np.array([y_train[id] for id in ids[0]])
        acc = (labels == query_label).astype(int).sum() / topK
        avg_acc += acc
        if i % 1000 == 0:
            print('{} / {}: {}'.format(i, coded_test.shape[0], acc))
    avg_acc /= coded_test.shape[0]
    print("The average top K accuracy is: {}".format(avg_acc))
def spatial_lda_internal(adata_subset, x_coordinate, y_coordinate,
                         phenotype, method, radius, knn, imageid):
    # Print which image is being processed
    print('Processing: ' + str(np.unique(adata_subset.obs[imageid])))
    # Create a DataFrame with the necessary information
    data = pd.DataFrame({
        'x': adata_subset.obs[x_coordinate],
        'y': adata_subset.obs[y_coordinate],
        'phenotype': adata_subset.obs[phenotype]
    })
    # Identify neighbourhoods based on the method used
    # a) KNN method
    if method == 'knn':
        print("Identifying the " + str(knn) + " nearest neighbours for every cell")
        tree = BallTree(data[['x', 'y']], leaf_size=2)
        ind = tree.query(data[['x', 'y']], k=knn, return_distance=False)
    # b) Local radius method
    if method == 'radius':
        print("Identifying neighbours within " + str(radius) + " pixels of every cell")
        kdt = BallTree(data[['x', 'y']], leaf_size=2)
        ind = kdt.query_radius(data[['x', 'y']], r=radius, return_distance=False)
    # Map phenotype
    phenomap = dict(zip(list(range(len(ind))), data['phenotype']))  # used for mapping
    for i in range(len(ind)):
        ind[i] = [phenomap[letter] for letter in ind[i]]
    return ind
def nn_search(self, tree_features, query_features,
              metric='haversine', convert_radians=False):
    '''
    Build a BallTree for nearest neighbor search based on haversine distance.

    Parameters
    ----------
    tree_features: array_like
        Input features to create the search tree. Features are in lat, lon
        format, in radians.
    query_features: array_like
        Points for which to find the nearest neighbor within the tree.
        latlon coordinates expected in radians for distance calculation.
    metric: str
        Distance metric for neighborhood search. Default haversine for
        latlon coordinates.
    convert_radians: bool
        Flag in case features are not in radians and need to be converted.

    Returns
    -------
    distances: array_like
        Array with the corresponding distance in km
        (haversine distance * earth radius)
    '''
    if convert_radians:
        # Degrees -> radians, as the haversine metric expects.
        tree_features = np.radians(tree_features)
        query_features = np.radians(query_features)
    tree = BallTree(tree_features, metric=metric)
    # earth radius in meters, converted to km
    return tree.query(query_features)[0] * 6371000 / 1000
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features)
        distances = distances.astype(np.float32)
        distances = abs(distances.dot(distances.T))
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, verbose=0)
        kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
                                              n_samples, n_components)

        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        neighbors_nn = neighbors_nn[:, 1:]
        distances_nn = np.array([distances[i, neighbors_nn[i]]
                                 for i in range(n_samples)])
        assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]), \
            abs(distances[0, neighbors_nn[0]] - distances_nn[0])
        P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn,
                                       perplexity, verbose=0)
        kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
                                           n_samples, n_components,
                                           angle=angle, skip_num_points=0,
                                           verbose=0)

        P = squareform(P)
        P_bh = P_bh.toarray()
        assert_array_almost_equal(P_bh, P, decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)
def get_nearest(src_points, candidates, k_neighbors=2):
    """
    Converts lat-long coords to great-circle distance and returns the two
    closest candidates.
    """
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=20, metric='haversine')
    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)
    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()
    # Get the two closest indices and distances (rows 0 and 1)
    closest = indices[0:2]
    closest_dist = distances[0:2]
    return (closest, closest_dist)
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951,
                        metric='minkowski', p=2):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric)
    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:, -1]
    min_spanning_tree = mst_linkage_core_vector(X, core_distances,
                                                dist_metric, alpha)
    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)
    return single_linkage_tree
def associate(rad_1, rad_2, k_nn=1):
    """
    Given two grids rad_1 and rad_2, this associates each point in rad_2 to
    the k-nearest neighbours in rad_1. Pairs of the form [latitude, longitude].
    """
    # Room for improvement:
    # - Run the Ball tree on the smallest net
    # - Use something more efficient than a Ball Tree, like a binary search.

    # Build Ball Tree
    Ball = BallTree(rad_1, metric='haversine')
    # Searching Data
    distances, indices = Ball.query(rad_2, k=k_nn, breadth_first=True,
                                    return_distance=True)
    assert rad_2.shape[0] == indices.shape[0]
    return distances, indices
def distance_to_port(lon, lat, ports):
    '''
    Take longitude and latitude and return the distance (km) to the closest
    port, using the World Port Index database.

    This uses a ball tree search approach in radians, accounting for the
    curvature of the Earth by calculating the Haversine metric for each pair
    of points. Note that the Haversine distance metric expects coordinate
    pairs in (lat, lon) order, in radians.

    Arguments:
        lon, lat: Arrays of longitude-latitude pairs of ship locations, in degrees
        ports: shape file of ports

    Returns:
        Pandas Series 'distance_to_port'
    '''
    ports_flip = np.flip(ports, axis=1)  # (lon, lat) -> (lat, lon)
    coords = pd.concat([np.radians(lat), np.radians(lon)], axis=1)
    tree = BallTree(np.radians(ports_flip), metric='haversine')
    dist, ind = tree.query(coords, k=1)
    df_distance_to_port = pd.Series(
        dist.flatten() * 6371,  # radius of earth (km)
        name='distance_to_port')
    return df_distance_to_port
class KNN(object):
    """
    The KNN classifier
    """
    def __init__(self, n_neighbors):
        self.n_neighbors = n_neighbors
        self.tree_ = None

    def fit(self, x_train, y_train):
        """
        Fitting the KNN classifier

        Hint: Build a tree to get neighbors faster at test time
        """
        self.tree_ = BallTree(x_train)
        self.y_train_ = y_train.reset_index(drop=True)
        return self

    def predict(self, x_test):
        """
        Predicting the test data

        Hint: Get the K-Neighbors, then generate predictions using the
        labels of the neighbors
        """
        n_test = x_test.shape[0]
        y_pred = []
        _, indices = self.tree_.query(x_test, k=self.n_neighbors)
        assert indices.shape[1] == self.n_neighbors
        for i in range(n_test):
            neighbor_classes = self.y_train_[indices[i]]
            cl = Counter(neighbor_classes).most_common(1)[0][0]
            y_pred.append(cl)
        return np.array(y_pred)
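# A usage sketch for the KNN class (toy data, an assumption). y_train is
# presumed to be a pandas Series, since fit calls reset_index on it.
import numpy as np
import pandas as pd

x_train = np.random.rand(100, 4)
y_train = pd.Series(np.random.randint(0, 2, 100))
model = KNN(n_neighbors=5).fit(x_train, y_train)
print(model.predict(np.random.rand(10, 4)))  # 10 predicted class labels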
class hyperSphere:
    def __init__(self, X):
        # constructs hypersphere
        self.nm = X.shape[0]
        self.nn_tree = BallTree(X, leaf_size=16, metric='euclidean')
        nn_dists, nn_ixs = self.nn_tree.query(X, k=2)
        # radii
        eps = 1e-10
        self.radii = nn_dists[:, 1].flatten() + eps
        # isolation scores
        self.scores = 1.0 - (self.radii[nn_ixs[:, 1].flatten()] / self.radii)

    def compute_isolation_score(self, sphere, X):
        # compute isolation score for sample X
        n, _ = X.shape
        scores = np.ones(n, dtype=float)  # np.float is deprecated; use float
        s_dists, s_ixs = sphere.nn_tree.query(X, k=self.nm)
        for i in range(n):
            cr = self.radii[s_ixs[i, :].flatten()]
            # belongs to these spheres
            ix_m = np.where(s_dists[i, :].flatten() <= cr)[0]
            # does not belong to any sphere
            if len(ix_m) == 0:
                continue
            # sphere with smallest radius; map argmin back through ix_m so it
            # indexes the original neighbor list
            ixs = np.argmin(cr[ix_m])
            ns = s_ixs[i, ix_m[ixs]]
            scores[i] = self.scores[ns]
        return scores
def index_nn_haversine(centroids, coordinates, threshold=THRESHOLD):
    """Compute the nearest centroid for each coordinate using a Ball tree
    with haversine distance.

    Parameters:
        centroids (2d array): First column contains latitude, second column
            contains longitude. Each row is a geographic point
        coordinates (2d array): First column contains latitude, second column
            contains longitude. Each row is a geographic point
        threshold (float): distance threshold in km over which no neighbor
            will be found. Those are assigned the index -1

    Returns:
        array with as many rows as coordinates, containing the centroid indexes
    """
    # Construct tree from centroids
    tree = BallTree(np.radians(centroids), metric='haversine')
    # Select unique exposures coordinates
    _, idx, inv = np.unique(coordinates, axis=0, return_index=True,
                            return_inverse=True)
    # Query the closest centroid for each unique point using the dual tree
    dist, assigned = tree.query(np.radians(coordinates[idx]), k=1,
                                return_distance=True, dualtree=True,
                                breadth_first=False)
    # Raise a warning if the minimum distance is greater than the
    # threshold and set an invalid index -1
    num_warn = np.sum(dist * EARTH_RADIUS_KM > threshold)
    if num_warn:
        LOGGER.warning('Distance to closest centroid is greater than %s'
                       'km for %s coordinates.', threshold, num_warn)
        assigned[dist * EARTH_RADIUS_KM > threshold] = -1
    # Copy result to all exposures and return value
    return np.squeeze(assigned[inv])
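# A usage sketch for index_nn_haversine (toy coordinates, an assumption;
# EARTH_RADIUS_KM, THRESHOLD and a module-level LOGGER are presumed to be
# defined as in the module above). Inputs are in degrees; the function
# converts to radians internally.
import numpy as np

centroids = np.array([[60.17, 24.94], [59.33, 18.07]])    # (lat, lon) degrees
coordinates = np.array([[60.2, 24.9], [59.3, 18.1], [0.0, 0.0]])
assigned = index_nn_haversine(centroids, coordinates, threshold=100)
print(assigned)  # e.g. [0, 1, -1]: the equator point exceeds the threshold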
def detectCloseSensor(src_points, candidates, k_neighbors=1):
    """
    Find nearest neighbors for all source points from a set of candidate points

    Parameters
    ----------
    src_points : the infrastructure points
    candidates : the sensor points

    Returns
    -------
    TYPE
        DESCRIPTION.
    """
    from sklearn.neighbors import BallTree
    # Create tree from the candidate points
    print("candidates", candidates)
    tree = BallTree(candidates, leaf_size=15, metric='haversine')
    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)
    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()
    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    closest = indices.ravel()
    closest_dist = distances.ravel()
    # Return indices and distances
    return (closest, closest_dist)
class GraphNearestNode(ContextManager):
    RAD_PER_DEGREE = pi / 180
    EARTH_RADIUS_METERS = 6367.5 * 1e3

    def __init__(self, graph):
        # Point array
        self.X = pd.DataFrame(data=nx.get_node_attributes(graph, "loc")).T
        # Nearest neighbors tree
        self.T = BallTree(self.X.values * self.RAD_PER_DEGREE,
                          metric="haversine")

    def __call__(self, locs):
        # Get nearest nodes: distance to X and index in X
        (d, i) = np.squeeze(
            self.T.query(np.asarray(locs) * self.RAD_PER_DEGREE,
                         k=1, return_distance=True))
        # Note: do not sort the Series
        s = pd.Series(index=(self.X.index[list(map(int, i))]),
                      data=(d * self.EARTH_RADIUS_METERS))
        return s

    def __exit__(self, exc_type, exc_val, exc_tb):
        return None
def setUp(self):
    self.numPoints = 500
    self.dim = 10
    self.numNhbrs = 3
    self.numQuery = 5

    # Known Numpy implementation:
    np.random.seed(0)
    self.X = np.random.random((self.numPoints, self.dim))  # 500 points in 10 dimensions
    tree = BallTree(self.X, leaf_size=2)
    dist, ind = tree.query(self.X[0:self.numQuery, ], k=self.numNhbrs)
    self.nrstNbhrs = self.X[ind, :]

    self.feeder5 = tf.placeholder(tf.float32, [self.numQuery, self.dim])
    self.feeder1 = tf.placeholder(tf.float32, [1, self.dim])
    with self.test_session() as sess:
        self.testKnn = tfknn(self.X.shape[0], self.X.shape[1], sess)
        # Integer division so range() receives an int under Python 3.
        for i in range(self.X.shape[0] // 100):
            addOp = self.testKnn.addPoints_np(self.X[100 * i:100 * (i + 1), ])
        self.testKnn.compile()

    self.distMatrices = []
    for points in self.nrstNbhrs:
        self.distMatrices.append(distance_matrix(points, points))
    self.distMatrices = np.stack(self.distMatrices)
def get_nearest(src_points, candidates, k_neighbors=1, distance_threshold=None):
    """Find nearest neighbors for all source points from a set of candidate points

    Args:
        src_points: a pandas row with a geometry column
        candidates: pandas df
        k_neighbors: number of neighbors
        distance_threshold: minimum distance in meters
    """
    # Create tree from the candidate points.
    # Note: the haversine metric expects (lat, lon) in radians; here the raw
    # centroid (x, y) coordinates are passed in directly.
    coordinates = np.vstack(candidates.geometry.centroid.apply(
        lambda geom: (geom.x, geom.y)))
    tree = BallTree(coordinates, leaf_size=15, metric='haversine')
    # Find closest points and distances
    # src_points = src_points.reset_index()
    src_x = src_points.geometry.centroid.x
    src_y = src_points.geometry.centroid.y
    src_points = np.array([src_x, src_y]).reshape(-1, 2)
    # If there are not enough neighbors, reduce k
    if k_neighbors > candidates.shape[0]:
        effective_neighbors = candidates.shape[0]
    else:
        effective_neighbors = k_neighbors
    distances, indices = tree.query(src_points, k=effective_neighbors)
    neighbor_geoms = candidates[candidates.index.isin(indices[0])]
    neighbor_geoms["distance"] = distances[0]
    if distance_threshold:
        neighbor_geoms = neighbor_geoms[neighbor_geoms.distance > distance_threshold]
    # Return the neighbor geometries with their distances
    return neighbor_geoms