def test_ball_tree_pickle():
    """Check that BallTree objects survive a pickle round-trip.

    A tree using the default metric and a tree using the Python callable
    metric ``dist_func`` are pickled and restored with protocols 0, 1 and 2.
    Each restored tree must return the same query results as its original.
    """
    np.random.seed(0)
    X = np.random.random((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # BallTree.query returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)
    dist1_pyfunc, ind1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)

        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        dist2, ind2 = bt2.query(X)
        dist2_pyfunc, ind2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)

        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)
        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)

    # Run each protocol directly instead of yielding it: generator-style
    # tests are not executed by pytest >= 4, so the assertions would
    # silently never run.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def test_ball_tree_pickle():
    """Pickle round-trip test for BallTree (protocols 0, 1 and 2).

    Covers a tree with the default metric and a tree built on the Python
    callable metric ``dist_func``. After unpickling, both trees must give
    the same query output as before, and the restored default-metric tree
    must still be a ``BallTree`` instance.
    """
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    plain_tree = BallTree(X, leaf_size=1)
    # A tree built on a callable metric has to be picklable as well.
    callable_tree = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    ref_first, ref_second = plain_tree.query(X)
    ref_first_callable, ref_second_callable = callable_tree.query(X)

    def check_pickle_protocol(protocol):
        restored_plain = pickle.loads(
            pickle.dumps(plain_tree, protocol=protocol))
        restored_callable = pickle.loads(
            pickle.dumps(callable_tree, protocol=protocol))

        new_first, new_second = restored_plain.query(X)
        new_first_callable, new_second_callable = restored_callable.query(X)

        comparisons = (
            (ref_first, new_first),
            (ref_second, new_second),
            (ref_first_callable, new_first_callable),
            (ref_second_callable, new_second_callable),
        )
        for reference, restored in comparisons:
            assert_array_almost_equal(reference, restored)

        assert isinstance(restored_plain, BallTree)

    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def lof(X, k, outlier_threshold=1.5, verbose=False):
    """Flag outliers in ``X`` using a local-outlier-factor style score.

    Neighbours are looked up with a BallTree built on ``X``. For every point
    the mean reachability distance to its neighbours is computed, inverted
    into a density, and compared with the densities of those neighbours.

    Parameters
    ----------
    X : indexable array of samples
        Passed to ``BallTree`` and indexed by row for the returned records.
    k : int
        Number of neighbours requested from the tree. The first column of
        the query result is dropped, so ``k - 1`` neighbours are scored.
    outlier_threshold : float, default 1.5
        Points whose score is strictly greater than this are reported.
    verbose : bool, default False
        When true, print the threshold and the detected outliers.

    Returns
    -------
    list
        One ``[index, X[index], score]`` entry per detected outlier.
    """
    tree = BallTree(X, leaf_size=2)
    distance, index = tree.query(X, k)
    # Drop the closest hit of every row; presumably this is the point itself,
    # since X is queried against a tree built from X (TODO confirm for data
    # with duplicate points).
    distance, index = distance[:, 1:], index[:, 1:]
    # Distance from each point to its farthest retained neighbour.
    radius = distance[:, -1]

    # Mean reachability distance per point and its inverse (the density).
    mean_reach_dist = np.mean(np.maximum(distance, radius[index]), axis=1)
    density = 1. / mean_reach_dist

    # Ratio of the neighbours' densities to the point's own density.
    # The denominator is kept in full precision: casting it to float16 loses
    # accuracy and maps very large/small densities to inf/0.
    outlier_score = np.sum(density[index], axis=1) / density
    # NOTE(review): the sum runs over k - 1 neighbours but is scaled by 1 / k;
    # kept as-is because the default threshold may be tuned to it.
    outlier_score *= 1. / k

    if verbose:
        print("Recording all outliers with outlier score greater than %s." \
              % (outlier_threshold))

    outliers = [[i, X[i], score]
                for i, score in enumerate(outlier_score)
                if score > outlier_threshold]

    if verbose:
        print("Detected outliers:")
        print(outliers)

    return outliers
def experiment_setup(sat_pos, altitude, src_pos, gst_pos, min_elev, orbits,
                     sat_per_orbit, terrestrial_gst_graph, path_control):
    """Compute the distance inputs shared by an experiment.

    Returns a 4-tuple ``(src_gst_ind, src_gst_dist, gst_gst_terrestrial,
    gst_gst_satellite)``: the ground stations nearest to each source with
    their distances, the terrestrial GST-to-GST distances, and the
    GST-to-GST distances over the satellite network.

    ``path_control`` is forwarded as the third positional argument of
    ``src_nearest_gst_distance``.
    """
    # Distances between satellites.
    inter_sat_dist = compute_sat_sat_distance(sat_pos, altitude, orbits,
                                              sat_per_orbit)

    # Haversine BallTree over the satellite positions (converted to
    # radians); used for nearest-satellite lookups.
    satellite_tree = BallTree(
        np.deg2rad(sat_pos),
        metric=DistanceMetric.get_metric("haversine"),
    )

    # Satellites in reach of each ground station, with their distances.
    reachable_sat_ind, reachable_sat_dist = compute_gst_sat_distance(
        altitude, min_elev, gst_pos, satellite_tree)

    # Terrestrial nearest ground stations for every source.
    nearest_gst_ind, nearest_gst_dist = src_nearest_gst_distance(
        src_pos, gst_pos, path_control)

    # GST -> GST distance over the terrestrial graph.
    terrestrial_dist = gst_gst_terrestrial_distance(terrestrial_gst_graph,
                                                    gst_pos)

    # GST -> GST distance over the satellites.
    satellite_dist = gsts_optimization(reachable_sat_ind,
                                       reachable_sat_dist,
                                       inter_sat_dist,
                                       n_gsts=gst_pos.shape[0])

    return nearest_gst_ind, nearest_gst_dist, terrestrial_dist, satellite_dist
def test_ball_tree_kde(n_samples=100, n_features=3):
    """Compare BallTree.kernel_density with the slow reference estimate.

    Every combination of kernel, bandwidth ``h``, ``rtol``, ``atol`` and
    traversal order is checked against ``compute_kernel_slow``.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    # The reference density is an explicit argument rather than a closure
    # variable: a closure would late-bind ``dens_true`` and could compare
    # against the value of a later loop iteration if a call were deferred.
    def check_results(kernel, h, atol, rtol, breadth_first, dens_true):
        dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol,
                                 kernel=kernel,
                                 breadth_first=breadth_first)
        assert_allclose(dens, dens_true,
                        atol=atol, rtol=max(rtol, 1e-7))

    for kernel in ['gaussian', 'tophat', 'epanechnikov',
                   'exponential', 'linear', 'cosine']:
        for h in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        # Call directly instead of yielding: generator-style
                        # tests are not executed by pytest >= 4.
                        check_results(kernel, h, atol, rtol,
                                      breadth_first, dens_true)
def src_nearest_gst_distance(src_pos, gst_pos, nn=1):
    """Return the ``nn`` nearest ground stations of each source.

    Positions are converted from degrees to radians and matched with a
    haversine BallTree. The returned distances are converted with
    ``haversine_to_km`` and then multiplied by ``FIBER_PATH_STRETCH``,
    so they INCLUDE PATH STRETCH.

    Returns ``(indices, stretched_distances)``.
    """
    gst_rad = np.deg2rad(gst_pos)
    haversine = DistanceMetric.get_metric("haversine")
    station_tree = BallTree(gst_rad, metric=haversine)

    raw_dist, nearest_ind = station_tree.query(np.deg2rad(src_pos), k=nn)

    stretched_dist = haversine_to_km(raw_dist) * FIBER_PATH_STRETCH
    return nearest_ind, stretched_dist
def test_query_haversine():
    """BallTree haversine queries must agree with brute-force neighbours."""
    rng = check_random_state(0)
    points = 2 * np.pi * rng.random_sample((40, 2))

    tree = BallTree(points, leaf_size=1, metric='haversine')
    tree_dist, tree_ind = tree.query(points, k=5)

    brute_dist, brute_ind = brute_force_neighbors(points, points, k=5,
                                                  metric='haversine')

    for from_tree, from_brute in ((tree_dist, brute_dist),
                                  (tree_ind, brute_ind)):
        assert_array_almost_equal(from_tree, from_brute)
def test_query_haversine():
    """BallTree haversine queries must agree with brute-force neighbours.

    The global NumPy random generator is seeded with 0 to build the data.
    """
    np.random.seed(0)
    points = 2 * np.pi * np.random.random((40, 2))

    tree = BallTree(points, leaf_size=1, metric='haversine')
    tree_dist, tree_ind = tree.query(points, k=5)

    brute_dist, brute_ind = brute_force_neighbors(points, points, k=5,
                                                  metric='haversine')

    for from_tree, from_brute in ((tree_dist, brute_dist),
                                  (tree_ind, brute_ind)):
        assert_array_almost_equal(from_tree, from_brute)
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Compare BallTree k-neighbour distances with the brute-force result.

    ``X`` (tree data) and ``Y`` (query points) are not parameters; they are
    resolved from the enclosing scope, which is not visible here.
    ``kwargs`` holds the extra keyword arguments of ``metric``.
    """
    tree = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    tree_dist, _tree_ind = tree.query(Y, k, dualtree=dualtree,
                                      breadth_first=breadth_first)

    brute_dist, _brute_ind = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # Only distances are compared: when several neighbours are equally
    # far away the returned indices may legitimately differ.
    assert_array_almost_equal(tree_dist, brute_dist)
def _fit(self, X):
    """Validate the settings and build the neighbour-search structure.

    Only the Euclidean metric is supported: ``metric`` must be
    ``'euclidean'`` or ``'minkowski'`` with ``p == 2``. ``self.algorithm``
    selects a BallTree, a KDTree or brute force (no tree). [IY]

    Note that in sompy.project_realdata() the algorithm is set by default
    (e.g. to 'brute' or 'kd_tree').

    Raises ``ValueError`` for an unsupported metric, empty input, a metric
    that is invalid for sparse input, an unknown algorithm, or a
    non-positive ``n_neighbors``. Returns ``self``.
    """
    params = self.metric_params
    self.effective_metric_params_ = {} if params is None else params.copy()

    if self.metric not in ('euclidean', 'minkowski'):
        raise ValueError("Using Euclidean distance with the wrong metric")
    self.effective_metric_ = self.metric

    # Minkowski is accepted only when it reduces to the Euclidean metric.
    if self.metric == 'minkowski':
        p = self.effective_metric_params_.pop('p', 2)
        if p != 2:
            raise ValueError(
                "cannot replace Minkowski with Euclidian metric")
        self.effective_metric_ = 'euclidean'

    X = check_array(X, accept_sparse='csr')
    if X.shape[0] == 0:
        raise ValueError("n_samples must be greater than 0")

    sparse_ok = self.effective_metric_ in VALID_METRICS_SPARSE['brute']
    if issparse(X) and not sparse_ok:
        raise ValueError("metric '%s' not valid for sparse input" %
                         self.effective_metric_)

    self._fit_method = self.algorithm
    self._fit_X = X

    method = self._fit_method
    if method in ('ball_tree', 'kd_tree'):
        tree_cls = BallTree if method == 'ball_tree' else KDTree
        self._tree = tree_cls(X, self.leaf_size,
                              metric=self.effective_metric_,
                              **self.effective_metric_params_)
    elif method == 'brute':
        self._tree = None
    else:
        raise ValueError("algorithm = '%s' not recognized" % self.algorithm)

    if self.n_neighbors is not None and self.n_neighbors <= 0:
        raise ValueError("Expected n_neighbors > 0. Got %d" %
                         self.n_neighbors)

    return self
def __init__(self, sen2vec, corpus_path, corpus_vec_path):
    """Load the corpus and its vectors, then index the non-empty vectors.

    ``corpus_path`` is read with ``pandas.read_csv`` and ``corpus_vec_path``
    with ``load_qa_corpus_vec``. Vectors for which ``any(v)`` is false are
    left out of the tree; ``self._indices`` records the original position
    of every vector that was kept.
    """
    self.sen2vec = sen2vec
    self._corpus = pd.read_csv(corpus_path)
    self._vectors = load_qa_corpus_vec(corpus_vec_path)

    kept = [(pos, vec) for pos, vec in enumerate(self._vectors) if any(vec)]
    self._indices = [pos for pos, _ in kept]
    X = np.array([vec for _, vec in kept])

    # Build the ball tree
    self.tree = BallTree(X)
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde.

    For several bandwidths ``h`` the BallTree density, divided by
    ``n_samples``, must match scipy's estimate to 3 decimals.
    """
    from scipy.stats import gaussian_kde
    rng = check_random_state(0)
    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # The tree and the sample standard deviation do not depend on the
    # bandwidth, so they are computed once instead of on every iteration.
    bt = BallTree(x_in[:, None])
    x_std = np.std(x_in)

    for h in [0.01, 0.1, 1]:
        gkde = gaussian_kde(x_in, bw_method=h / x_std)

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
def test_ball_tree_query_metrics(metric):
    """BallTree distances must match brute force for the given metric.

    Metrics in ``BOOLEAN_METRICS`` get data rounded from [0, 1) samples;
    metrics in ``DISCRETE_METRICS`` get data rounded from [0, 4) samples.
    """
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        train = rng.random_sample((40, 10)).round(0)
        queries = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        train = (4 * rng.random_sample((40, 10))).round(0)
        queries = (4 * rng.random_sample((10, 10))).round(0)

    n_neighbors = 5

    tree = BallTree(train, leaf_size=1, metric=metric)
    tree_dist, _tree_ind = tree.query(queries, n_neighbors)

    brute_dist, _brute_ind = brute_force_neighbors(train, queries,
                                                   n_neighbors, metric)
    assert_array_almost_equal(tree_dist, brute_dist)
def test_ball_tree_pickle():
    """Check that a BallTree survives a pickle round-trip.

    The tree is pickled and restored with protocols 0, 1 and 2; the
    restored tree must return the same query results as the original.
    """
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    # BallTree.query returns (distances, indices), in that order.
    dist1, ind1 = bt1.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        dist2, ind2 = bt2.query(X)
        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1, ind2)

    # Run each protocol directly instead of yielding it: generator-style
    # tests are not executed by pytest >= 4, so the assertions would
    # silently never run.
    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
def test_ball_tree_two_point(n_samples=100, n_features=3):
    """Check BallTree.two_point_correlation against pairwise distances.

    The expected count for each radius is the number of (Y, X) pairs whose
    Euclidean distance is at most that radius. Both the dual-tree and the
    single-tree code paths are exercised.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    bt = BallTree(X, leaf_size=10)

    D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    counts_true = [(D <= ri).sum() for ri in r]

    def check_two_point(r, dualtree):
        counts = bt.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_array_almost_equal(counts, counts_true)

    # Call directly instead of yielding: generator-style tests are not
    # executed by pytest >= 4.
    for dualtree in (True, False):
        check_two_point(r, dualtree)
def test_ball_tree_query(metric, k, dualtree, breadth_first):
    """BallTree k-neighbour distances must match the brute-force result.

    Extra keyword arguments for ``metric`` are looked up in ``METRICS``.
    """
    rng = check_random_state(0)
    train = rng.random_sample((40, DIMENSION))
    queries = rng.random_sample((10, DIMENSION))

    metric_kwargs = METRICS[metric]

    tree = BallTree(train, leaf_size=1, metric=metric, **metric_kwargs)
    tree_dist, _tree_ind = tree.query(queries, k, dualtree=dualtree,
                                      breadth_first=breadth_first)

    brute_dist, _brute_ind = brute_force_neighbors(train, queries, k, metric,
                                                   **metric_kwargs)

    # Only distances are compared: when several neighbours are equally
    # far away the returned indices may legitimately differ.
    assert_array_almost_equal(tree_dist, brute_dist)
def query(self, X: np.ndarray, k: Optional[int] = None) -> np.ndarray:
    """
    Look up the k nearest nodes of every sample.

    A Euclidean BallTree is built from ``self.nodes`` on each call.

    Parameters:
        X: An array of shape (num_samples, num_features).
        k: The number of neighbors to return; ``self._k`` is used when
            this is None.

    Returns:
        An array of shape (num_samples, k) and of type int containing the
        indices of the k nearest nodes.
    """
    n_neighbors = self._k if k is None else k
    node_tree = BallTree(self.nodes, metric="euclidean")
    _distances, neighbor_indices = node_tree.query(X, n_neighbors)
    return neighbor_indices
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    """query_radius must return exactly the points within the radius.

    The query point is the origin. For 100 radii between the norms of the
    first and the last sample, the indices returned by the tree are
    compared with those found by thresholding the norms directly.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    norms = np.sqrt(np.sum((X - query_pt) ** 2, axis=1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        found = np.sort(bt.query_radius(query_pt, radius + eps)[0])
        expected = np.sort(np.where(norms <= radius + eps)[0])

        assert_array_almost_equal(expected, found)
def test_ball_tree_kde(n_samples=100, n_features=3):
    """Compare BallTree.kernel_density with the slow reference estimate.

    Every combination of kernel, bandwidth ``h``, ``rtol``, ``atol`` and
    traversal order is handed to ``check_results`` together with the tree,
    the query points and the reference density.
    """
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    for kernel in ['gaussian', 'tophat', 'epanechnikov',
                   'exponential', 'linear', 'cosine']:
        for h in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        # Call directly instead of yielding: generator-style
                        # tests are not executed by pytest >= 4.
                        check_results(kernel, h, atol, rtol,
                                      breadth_first, bt, Y, dens_true)
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde.

    For several bandwidths ``h`` the BallTree density, divided by
    ``n_samples``, must match scipy's estimate to 3 decimals. The test is
    skipped when ``gaussian_kde`` raises ``TypeError`` for the explicit
    bandwidth argument.
    """
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # The tree and the sample standard deviation do not depend on the
    # bandwidth, so they are computed once instead of on every iteration.
    bt = BallTree(x_in[:, None])
    x_std = np.std(x_in)

    for h in [0.01, 0.1, 1]:
        try:
            gkde = gaussian_kde(x_in, bw_method=h / x_std)
        except TypeError:
            raise SkipTest("Old version of scipy, doesn't accept explicit bandwidth.")

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances returned by query_radius must match recomputed distances.

    The query point is the origin. For 100 radii between the norms of the
    first and the last sample, the distances reported by the tree are
    compared with Euclidean distances computed from the returned indices.
    """
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    norms = np.sqrt(np.sum((X - query_pt) ** 2, axis=1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        all_ind, all_dist = bt.query_radius(query_pt, radius + eps,
                                            return_distance=True)

        # A single query point was passed, so take the first result row.
        hits, reported = all_ind[0], all_dist[0]

        recomputed = np.sqrt(np.sum((query_pt - X[hits]) ** 2, axis=1))

        assert_array_almost_equal(recomputed, reported)
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde.

    For several bandwidths ``h`` the BallTree density, divided by
    ``n_samples``, must match scipy's estimate within ``rtol=1E-3`` and
    ``atol=1E-3``. The test is skipped when ``gaussian_kde`` raises
    ``TypeError`` for the explicit bandwidth argument.
    """
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    # The tree and the sample standard deviation do not depend on the
    # bandwidth, so they are computed once instead of on every iteration.
    bt = BallTree(x_in[:, None])
    x_std = np.std(x_in)

    for h in [0.01, 0.1, 1]:
        try:
            gkde = gaussian_kde(x_in, bw_method=h / x_std)
        except TypeError:
            # older versions of scipy don't accept explicit bandwidth
            raise SkipTest

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_allclose(dens_bt, dens_gkde, rtol=1E-3, atol=1E-3)
def optimize_end_to_end_latency_rerouting(sat_pos, altitude, gst_pos, src_pos,
                                          min_elev, orbits, sat_per_orbit,
                                          terrestrial_gst_graph, inactive):
    """Build the source-to-destination latency matrix with rerouting.

    Ground stations listed in ``inactive`` are mapped to their closest
    active station over the terrestrial distances. The result combines
    source-to-GST distances, that rerouting, and the GST-to-GST distances
    over the satellite network.

    Returns ``(src_dst_latency, nearest_active)``.
    """
    # Distances within the satellite graph.
    inter_sat_dist = compute_sat_sat_distance(sat_pos, altitude, orbits,
                                              sat_per_orbit)

    # Haversine BallTree over the satellite positions (converted to
    # radians); used for nearest-satellite lookups.
    satellite_tree = BallTree(
        np.deg2rad(sat_pos),
        metric=DistanceMetric.get_metric("haversine"),
    )

    # Satellites in reach of each ground station, with their distances.
    reachable_sat_ind, reachable_sat_dist = compute_gst_sat_distance(
        altitude, min_elev, gst_pos, satellite_tree)

    # GST -> GST distance over the terrestrial graph.
    terrestrial_dist = gst_gst_terrestrial_distance(terrestrial_gst_graph,
                                                    gst_pos)

    # GST -> GST distance over the satellites.
    satellite_dist = gsts_optimization(reachable_sat_ind,
                                       reachable_sat_dist,
                                       inter_sat_dist,
                                       n_gsts=gst_pos.shape[0])

    # Closest active ground station for every inactive one.
    nearest_active, nearest_active_dist = inactive_to_closest_active(
        inactive, terrestrial_dist)

    # Closest ground station to every source, with its distance.
    nearest_gst_ind, nearest_gst_dist = src_nearest_gst_distance(src_pos,
                                                                 gst_pos)

    # Combine everything into the src-dst distance matrix.
    source_count = src_pos.shape[0]
    src_dst_latency = compute_src_dst_latency(source_count, inactive,
                                              nearest_gst_ind,
                                              nearest_gst_dist,
                                              nearest_active,
                                              nearest_active_dist,
                                              satellite_dist)

    return src_dst_latency, nearest_active
def rank(self, cs, yc, ls, lss):
    """Rank all candidates for each query and mark the relevant ones.

    A BallTree is built on ``yc``; every row of ``cs`` is queried for all
    ``len(yc)`` neighbours, nearest first.

    Parameters
    ----------
    cs : query vectors, one per row.
    yc : candidate vectors indexed by the tree.
    ls : labels of the candidates; the position of a label in ``ls`` is
        used as the index of its row in ``yc``.
    lss : for each query, the iterable of labels counted as hits.

    Returns
    -------
    list of numpy arrays
        One array per query, in ranking order, holding 1.0 where the ranked
        candidate is one of that query's labels and 0.0 elsewhere.

    Raises
    ------
    KeyError
        If a label in ``lss`` does not occur in ``ls``.
    """
    targets = {label: position for position, label in enumerate(ls)}

    # Number of results (lemmas) ranked
    n_results = len(yc)

    # Build ball tree model
    ball_tree = BallTree(yc)
    ranked_indices = ball_tree.query(cs, k=n_results, return_distance=False)

    rankings = []
    for ranking, query_labels in zip(ranked_indices, lss):
        # A set gives O(1) membership tests while scanning the ranking.
        relevant = {targets[label] for label in query_labels}
        rankings.append(
            np.array([1.0 if candidate in relevant else 0.0
                      for candidate in ranking]))

    return rankings
def create_image(self, path, max_size, metric):
    """
    Match an image against itself and store the results in ``self.original``.

    The image is opened via ``imaging.open_img``, its features are
    extracted and matched against themselves with ``k=2``, and a BallTree
    using ``metric`` is built over the keypoint positions.

    ``self.original`` receives the keys "descriptors", "positions",
    "distances", "position_tree" and "size".
    """
    image = imaging.open_img(path, max_size)

    keypoints, descriptors = matchutil.get_features(image)

    # Self-match; with k=2 each result holds two matches per descriptor.
    self_matches = matchutil.flann_match(descriptors, descriptors, k=2)

    # Distance of the second match of every descriptor (presumably the
    # first match is the descriptor itself -- TODO confirm).
    second_match_dist = numpy.array(
        [pair[1].distance for pair in self_matches])
    keypoint_pos = numpy.array([keypoint.pt for keypoint in keypoints])

    pos_tree = BallTree(keypoint_pos, metric=metric)

    self.original = dict(
        descriptors=descriptors,
        positions=keypoint_pos,
        distances=second_match_dist,
        position_tree=pos_tree,
        size=image.shape,
    )
def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first, n_samples=100,
                       n_features=3):
    """BallTree.kernel_density must match the slow reference estimate.

    The comparison uses the requested ``atol`` and a relative tolerance of
    at least 1e-7.
    """
    rng = np.random.RandomState(0)
    train = rng.random_sample((n_samples, n_features))
    queries = rng.random_sample((n_samples, n_features))
    tree = BallTree(train, leaf_size=10)

    expected = compute_kernel_slow(queries, train, kernel, h)

    kde_options = dict(atol=atol, rtol=rtol, kernel=kernel,
                       breadth_first=breadth_first)
    estimated = tree.kernel_density(queries, h, **kde_options)

    assert_allclose(estimated, expected, atol=atol, rtol=max(rtol, 1e-7))
def _fit(self, X):
    """Prepare the neighbor-search structure and optional hubness reduction.

    ``X`` may be raw data, an already fitted ``NeighborsBase`` (its fitted
    attributes are copied), a ``BallTree`` / ``KDTree`` (used as the tree),
    or an ``ApproximateNearestNeighbor`` index (used as ``self._index``).
    For raw data the method selected by ``self.algorithm`` is built, and,
    when ``self.hubness`` is set, a hubness-reduction object is fitted on
    the candidate neighbors of ``X``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        For ``p <= 0`` with the minkowski metric, empty input, a metric
        that is not valid for sparse input, an unrecognized algorithm or
        hubness-reduction method, or ``n_neighbors <= 0``.
    TypeError
        If ``n_neighbors`` is not an integer type.
    NotImplementedError
        For the hubness-reduction methods 'snn' and 'simhubin'.
    """
    # Sanity checks implemented by helper methods on this class.
    self._check_algorithm_metric()
    self._check_hubness_algorithm()
    self._check_algorithm_hubness_compatibility()

    # Work on a copy so the user-supplied metric_params are not modified.
    if self.metric_params is None:
        self.effective_metric_params_ = {}
    else:
        self.effective_metric_params_ = self.metric_params.copy()

    # An explicit 'p' in metric_params takes precedence over self.p.
    effective_p = self.effective_metric_params_.get('p', self.p)
    if self.metric in ['wminkowski', 'minkowski']:
        self.effective_metric_params_['p'] = effective_p

    self.effective_metric_ = self.metric
    # For minkowski distance, use more efficient methods where available
    if self.metric == 'minkowski':
        p = self.effective_metric_params_.pop('p', 2)
        if p <= 0:
            raise ValueError(
                f"p must be greater than one for minkowski metric, "
                f"or in ]0, 1[ for fractional norms.")
        elif p == 1:
            self.effective_metric_ = 'manhattan'
        elif p == 2:
            self.effective_metric_ = 'euclidean'
        elif p == np.inf:
            self.effective_metric_ = 'chebyshev'
        else:
            # No specialised metric: keep minkowski with this p.
            self.effective_metric_params_['p'] = p

    # Already fitted estimator: reuse its fitted state.
    if isinstance(X, NeighborsBase):
        self._fit_X = X._fit_X
        self._tree = X._tree
        self._fit_method = X._fit_method
        self._index = X._index
        self._hubness_reduction = X._hubness_reduction
        return self

    # Prebuilt trees are used directly.
    elif isinstance(X, BallTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'ball_tree'
        return self

    elif isinstance(X, KDTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'kd_tree'
        return self

    # Prebuilt approximate index: stored in self._index, no tree.
    elif isinstance(X, ApproximateNearestNeighbor):
        self._tree = None
        # Only the two LSH variants also set self._fit_X here.
        if isinstance(X, PuffinnLSH):
            self._fit_X = X.X_train_
            self._fit_method = 'lsh'
        elif isinstance(X, FalconnLSH):
            self._fit_X = X.X_train_
            self._fit_method = 'falconn_lsh'
        elif isinstance(X, ONNG):
            self._fit_method = 'onng'
        elif isinstance(X, HNSW):
            self._fit_method = 'hnsw'
        elif isinstance(X, RandomProjectionTree):
            self._fit_method = 'rptree'
        self._index = X
        # TODO enable hubness reduction here ...
        return self

    X = check_array(X, accept_sparse='csr')

    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError(
            f"n_samples must be greater than 0 (but was {n_samples}.")

    # Sparse input always falls back to brute force without hubness
    # reduction.
    if issparse(X):
        if self.algorithm not in ('auto', 'brute'):
            warnings.warn("cannot use tree with sparse input: "
                          "using brute force")
        if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \
                and not callable(self.effective_metric_):
            raise ValueError(
                f"Metric '{self.effective_metric_}' not valid for sparse input. "
                f"Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) "
                f"to get valid options. Metric can also be a callable function."
            )
        self._fit_X = X.copy()
        self._tree = None
        self._fit_method = 'brute'
        if self.hubness is not None:
            # NOTE(review): the message mentions a tree although this is the
            # sparse / brute-force path -- confirm the intended wording.
            warnings.warn(
                f'cannot use hubness reduction with tree: disabling hubness reduction.'
            )
            # This overwrites the user-facing ``hubness`` attribute.
            self.hubness = None
        self._hubness_reduction_method = None
        self._hubness_reduction = NoHubnessReduction()
        return self

    self._fit_method = self.algorithm
    self._fit_X = X
    self._hubness_reduction_method = self.hubness

    if self._fit_method == 'auto':
        # A tree approach is better for small number of neighbors,
        # and KDTree is generally faster when available
        if ((self.n_neighbors is None or
             self.n_neighbors < self._fit_X.shape[0] // 2) and
                self.metric != 'precomputed'):
            if self.effective_metric_ in VALID_METRICS['kd_tree']:
                self._fit_method = 'kd_tree'
            elif (callable(self.effective_metric_) or
                  self.effective_metric_ in VALID_METRICS['ball_tree']):
                self._fit_method = 'ball_tree'
            else:
                self._fit_method = 'brute'
        else:
            self._fit_method = 'brute'
        self._index = None

    # Exact methods fill self._tree and leave self._index as None;
    # approximate methods fit an index and leave self._tree as None.
    if self._fit_method == 'ball_tree':
        self._tree = BallTree(X, self.leaf_size,
                              metric=self.effective_metric_,
                              **self.effective_metric_params_)
        self._index = None
    elif self._fit_method == 'kd_tree':
        self._tree = KDTree(X, self.leaf_size,
                            metric=self.effective_metric_,
                            **self.effective_metric_params_)
        self._index = None
    elif self._fit_method == 'brute':
        self._tree = None
        self._index = None
    elif self._fit_method == 'lsh':
        self._index = PuffinnLSH(verbose=self.verbose, **self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'falconn_lsh':
        self._index = FalconnLSH(verbose=self.verbose, **self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'onng':
        self._index = ONNG(verbose=self.verbose, **self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'hnsw':
        self._index = HNSW(verbose=self.verbose, **self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'rptree':
        self._index = RandomProjectionTree(verbose=self.verbose, **self.algorithm_params)
        self._index.fit(X)
        self._tree = None  # because it's a tree, but not an sklearn tree...
    else:
        raise ValueError(f"algorithm = '{self.algorithm}' not recognized")

    if self._hubness_reduction_method is None:
        self._hubness_reduction = NoHubnessReduction()
    else:
        # Hubness reduction is fitted on the candidate neighbors of the
        # training data; 'n_candidates' must be present in algorithm_params.
        n_candidates = self.algorithm_params['n_candidates']
        if 'include_self' in self.kwargs and self.kwargs['include_self']:
            neigh_train = self.kcandidates(X, n_neighbors=n_candidates, return_distance=True)
        else:
            neigh_train = self.kcandidates(n_neighbors=n_candidates, return_distance=True)
        # Remove self distances
        # (the slicing that would do so is commented out on the lines below,
        # so the full candidate arrays are used)
        neigh_dist_train = neigh_train[0]  # [:, 1:]
        neigh_ind_train = neigh_train[1]  # [:, 1:]
        if self._hubness_reduction_method == 'ls':
            self._hubness_reduction = LocalScaling(verbose=self.verbose, **self.hubness_params)
        elif self._hubness_reduction_method == 'mp':
            self._hubness_reduction = MutualProximity(
                verbose=self.verbose, **self.hubness_params)
        elif self._hubness_reduction_method == 'dsl':
            self._hubness_reduction = DisSimLocal(verbose=self.verbose, **self.hubness_params)
        elif self._hubness_reduction_method == 'snn':
            raise NotImplementedError('feature not yet implemented')
        elif self._hubness_reduction_method == 'simhubin':
            raise NotImplementedError('feature not yet implemented')
        else:
            raise ValueError(
                f'Hubness reduction algorithm = "{self._hubness_reduction_method}" not recognized.'
            )
        self._hubness_reduction.fit(neigh_dist_train, neigh_ind_train, X=X, assume_sorted=False)

    # n_neighbors is validated last, after the index has been built.
    if self.n_neighbors is not None:
        if self.n_neighbors <= 0:
            raise ValueError(
                f"Expected n_neighbors > 0. Got {self.n_neighbors:d}")
        else:
            if not np.issubdtype(type(self.n_neighbors), np.integer):
                raise TypeError(
                    f"n_neighbors does not take {type(self.n_neighbors)} value, "
                    f"enter integer value")

    return self
def check_neighbors(metric):
    """Compare BallTree neighbour distances with brute force for ``metric``.

    ``X`` (tree data), ``Y`` (query points) and ``k`` are not parameters;
    they are resolved from the enclosing scope, which is not visible here.
    Only distances are compared, not indices.
    """
    tree = BallTree(X, leaf_size=1, metric=metric)
    tree_dist, _tree_ind = tree.query(Y, k)

    brute_dist, _brute_ind = brute_force_neighbors(X, Y, k, metric)

    assert_array_almost_equal(tree_dist, brute_dist)
def get_ball_tree_index(X):
    """Build and return a BallTree over ``X`` with default settings."""
    ball_tree_index = BallTree(X)
    return ball_tree_index
# Nearest-neighbour lookup over the vectors stored in WandV: one DataFrame
# row per value of WandV, indexed with a BallTree.
import pandas as pd
# Public import path; the private module ``sklearn.neighbors.ball_tree`` is
# no longer importable in current scikit-learn releases.
from sklearn.neighbors import BallTree

# Build the frame in one step. Appending one row at a time with
# ``DataFrame.append`` copies the whole frame on every iteration and the
# method was removed in pandas 2.0.
df = pd.DataFrame(
    [pd.Series(i) for i in WandV.values()]
).reset_index(drop=True)

print("KNN ...........")
tree = BallTree(df, leaf_size=2)
print("finding neighbor words .....")

# Query the 3 nearest neighbours of the first row.
dist, ind = tree.query(df[:1], k=3)
print(ind)  # indices of 3 closest neighbors
print(dist)  # distances to 3 closest neighbors

# NOTE(review): rows 363 and 3774 are hard-coded; presumably they are the
# neighbours reported for row 0 on a particular data set -- confirm, or
# derive them from ``ind``.
v1 = df.iloc[0, :]
v2 = df.iloc[363, :]
v3 = df.iloc[3774, :]
V1 = np.array(v1)
V2 = np.array(v2)
V3 = np.array(v3)