def plot_nb_dists(X, nearest_neighbor, metric='euclidean', ylim=None):
    """Plot, for every point, the sorted distance to its k-th nearest neighbor.

    One curve is drawn per requested neighbor rank; each curve shows the
    distances of all points to their k-th neighbor in ascending order.

    Args:
        X (list of lists): list with data tuples
        nearest_neighbor (int or list of int): rank(s) of the nearest
            neighbor to plot
        metric (string): name of the distance metric handed to ``KDTree``
        ylim (float, optional): upper limit of the y axis; ``None`` lets
            matplotlib pick it.
    """
    # The metric used to be accepted but silently ignored; forward it.
    tree = KDTree(X, leaf_size=2, metric=metric)
    if not isinstance(nearest_neighbor, list):
        nearest_neighbor = [nearest_neighbor]
    max_nn = max(nearest_neighbor)
    # One extra column is requested because column ``nnb`` is indexed below.
    dist, _ = tree.query(X, k=max_nn + 1)

    plt.figure()
    for nnb in nearest_neighbor:
        # Sort a copy so the query result itself is left untouched.
        col = dist[:, nnb].copy()
        col.sort()
        plt.plot(col, label="{}th nearest neighbor".format(nnb))
    plt.ylabel("Distance to k nearest neighbor")
    plt.xlabel("Points sorted according to distance of k nearest neighbor")
    plt.ylim(0, ylim)
    plt.grid()
    plt.legend()
    plt.show()
def generate_pairs(patches, constants):
    """Pair query patches with their nearest candidate patches.

    For every image in ``patches`` the patches whose ``bucket`` lies in
    7..9 are queries (randomly thinned to ``constants.NUM_QUERY_PATCHES``
    with a fixed seed) and the patches whose ``bucket`` lies in 0..5 are
    candidates.  All candidates are indexed in one KD-tree and each query
    is matched with ``constants.K_NEAREST`` neighbours.

    Returns a list of ``[query_index, neighbour_index]`` pairs, where
    ``query_index`` is the query's position counted over all images and
    ``neighbour_index`` is the row in the concatenated candidate array.
    """
    k_nearest = constants.K_NEAREST
    max_queries = constants.NUM_QUERY_PATCHES

    query_database = []
    index_database = []
    candidate_database = []
    for image_patches in patches:
        eligible = [
            (position, patch.norm_patch)
            for position, patch in enumerate(image_patches)
            if 7 <= patch.bucket <= 9
        ]

        # Too many queries: keep a reproducible random subset for speed.
        if len(eligible) > max_queries:
            np.random.seed(0)
            chosen = np.random.choice(np.arange(len(eligible)),
                                      max_queries,
                                      replace=False).tolist()
            chosen.sort()
            eligible = [eligible[slot] for slot in chosen]

        query_indices = [position for position, _ in eligible]
        query_patches = [vector for _, vector in eligible]

        query_database.append(np.vstack([query_patches]))
        index_database.append(query_indices)
        candidate_database.append(
            np.vstack([[
                patch.norm_patch for patch in image_patches
                if 0 <= patch.bucket <= 5
            ]]))

    all_candidates = np.concatenate(candidate_database)
    tree = KDTree(all_candidates, leaf_size=30, metric='euclidean')

    # ``offset`` shifts per-image query indices into the global numbering.
    pairs = []
    offset = 0
    for image_patches, queries, local_indices in zip(patches, query_database,
                                                     index_database):
        neighbours = tree.query(queries,
                                k=k_nearest,
                                return_distance=False,
                                sort_results=False)
        global_indices = [offset + position for position in local_indices]
        for query_index, row in zip(global_indices, neighbours):
            pairs.extend([query_index, row[j]] for j in range(k_nearest))
        offset += len(image_patches)
    return pairs
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    """Compare KD-tree k-neighbour distances with the brute-force result.

    Uses the module-level ``X`` (indexed points) and ``Y`` (query points).
    """
    tree = KDTree(X, leaf_size=1, metric=metric, **kwargs)
    tree_dist, _ = tree.query(Y,
                              k,
                              dualtree=dualtree,
                              breadth_first=breadth_first)
    brute_dist, _ = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # Only distances are compared: with duplicate distances the neighbour
    # indices may legitimately differ between the two methods.
    assert_allclose(tree_dist, brute_dist)
def __call__(self, x, ma):
    """Estimate one Q-value per action from the episodic memory ``ma``.

    The input is embedded with ``self.l0`` followed by ``tanh``.  For each
    action ``j`` the (up to 50) stored embeddings in ``ma.mah[j]`` closest
    to the current embedding are looked up with a KD-tree, and their
    stored values ``ma.maq[j]`` are averaged with inverse squared-distance
    weights.

    Returns:
        tuple: ``(DiscreteActionValue, q_train, ind_list, dist_list, h.data)``
        where the three lists hold, per action, the differentiable
        Q estimate, the neighbour indices and the neighbour distances.
    """
    h = F.tanh(self.l0(x))
    # NOTE: the additional layers self.l1 / self.l2 are disabled here.

    q_train = []    # per-action Q estimates, used for training
    ind_list = []   # per-action neighbour indices, used for training
    dist_list = []  # per-action neighbour distances, used for training
    h_ = h.data
    for j in range(len(ma.maq)):  # loop over the actions
        h_list = ma.mah[j]
        lp = len(h_list)
        # Integer division: leaf_size has to be an int, and "/" yields a
        # float on Python 3.
        leaf_size = lp + lp // 2
        tree = KDTree(h_list, leaf_size=leaf_size)
        k = min(lp, 50)
        dist, ind = tree.query(h_, k=k)

        for count, ii in enumerate(ind[0]):
            mahi = np.zeros((1, 4), dtype=np.float32)
            mahi[0] = ma.mah[j][ii]
            hi = chainer.Variable(cuda.to_cpu(mahi))
            # Inverse squared distance; 0.001 avoids division by zero.
            wi = F.expand_dims(
                1 / (F.batch_l2_norm_squared((h - hi)) + 0.001), 1)

            maqi = np.zeros((1, 1), dtype=np.float32)
            maqi[0] = ma.maq[j][ii]
            q = chainer.Variable(cuda.to_cpu(maqi))

            if count == 0:
                w = wi
                qq = wi * q
            else:
                w += wi
                qq += wi * q

        qq /= w
        q_train.append(qq)
        ind_list.append(ind)
        dist_list.append(dist)
        self.q_list[0][j] = qq.data[0][0]

    qa = chainer.Variable(cuda.to_cpu(self.q_list))
    return chainerrl.action_value.DiscreteActionValue(
        qa), q_train, ind_list, dist_list, h.data
def test_kd_tree_two_point(dualtree):
    """Two-point correlation counts must match a brute-force count."""
    n_samples, n_features = 100, 3
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    radii = np.linspace(0, 1, 10)

    tree = KDTree(X, leaf_size=10)

    # Reference: count all pairs within each radius from the full matrix.
    pairwise = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    expected = [(pairwise <= radius).sum() for radius in radii]

    observed = tree.two_point_correlation(Y, r=radii, dualtree=dualtree)
    assert_array_almost_equal(observed, expected)
def eps_neighbourhood(X, index, eps, metric):
    """
    Return the indices of all points within ``eps`` of ``X[index]``.

    :param X: data
    :param index: index position of the centre point in data
    :param eps: radius of the neighbourhood
    :param metric: distance metric
    :return: vector of indices
    """
    kd_tree = KDTree(X, leaf_size=2, metric=metric)
    centre = [X[index]]  # query_radius expects a 2-D batch of points
    hits = kd_tree.query_radius(centre, r=eps)
    return hits[0]
def test_kd_tree_pickle(protocol):
    """A pickled and restored KDTree must answer queries identically."""
    import pickle

    rng = check_random_state(0)
    X = rng.random_sample((10, 3))
    original_tree = KDTree(X, leaf_size=1)
    expected = original_tree.query(X)

    payload = pickle.dumps(original_tree, protocol=protocol)
    restored_tree = pickle.loads(payload)
    actual = restored_tree.query(X)

    # Compare both arrays returned by query(), in order.
    for before, after in zip(expected, actual):
        assert_array_almost_equal(before, after)
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    rng = check_random_state(0)
    samples = rng.normal(0, 1, n_samples)
    grid = np.linspace(-5, 5, 30)

    for bandwidth in (0.01, 0.1, 1):
        tree = KDTree(samples[:, None])
        # scipy expresses the bandwidth as a factor of the sample std.
        reference = gaussian_kde(samples,
                                 bw_method=bandwidth / np.std(samples))

        tree_density = tree.kernel_density(grid[:, None],
                                           bandwidth) / n_samples
        reference_density = reference.evaluate(grid)

        assert_array_almost_equal(tree_density, reference_density, decimal=3)
def test_kd_tree_pickle():
    """Yield one pickle round-trip check per protocol 0, 1 and 2."""
    import pickle

    np.random.seed(0)
    X = np.random.random((10, 3))
    tree = KDTree(X, leaf_size=1)
    reference = tree.query(X)

    def check_pickle_protocol(protocol):
        # Round-trip the tree and compare both arrays returned by query().
        clone = pickle.loads(pickle.dumps(tree, protocol=protocol))
        for expected, actual in zip(reference, clone.query(X)):
            assert_array_almost_equal(expected, actual)

    for protocol in range(3):
        yield check_pickle_protocol, protocol
def test_kd_tree_pickle():
    """Generator test: a KDTree survives pickling with protocols 0-2."""
    import pickle

    np.random.seed(0)
    X = np.random.random((10, 3))
    source_tree = KDTree(X, leaf_size=1)
    first_before, second_before = source_tree.query(X)

    def check_pickle_protocol(protocol):
        serialized = pickle.dumps(source_tree, protocol=protocol)
        restored = pickle.loads(serialized)
        first_after, second_after = restored.query(X)
        assert_allclose(first_before, first_after)
        assert_allclose(second_before, second_after)

    protocols = (0, 1, 2)
    for protocol in protocols:
        yield check_pickle_protocol, protocol
def test_kd_tree_kde(n_samples=100, n_features=3):
    """Yield kernel-density checks for every kernel/bandwidth/tolerance.

    The tree estimate is compared with ``compute_kernel_slow`` for each
    combination of kernel, bandwidth ``h``, ``rtol``, ``atol`` and
    traversal order.
    """
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    kdt = KDTree(X, leaf_size=10)

    for kernel in [
            'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
            'cosine'
    ]:
        for h in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, h)

            # Bind the reference density as a default argument.  A plain
            # closure would see whatever ``dens_true`` holds when the check
            # finally runs, which is wrong if a runner collects all
            # yielded checks before executing them.
            def check_results(kernel, h, atol, rtol, breadth_first,
                              dens_true=dens_true):
                dens = kdt.kernel_density(Y,
                                          h,
                                          atol=atol,
                                          rtol=rtol,
                                          kernel=kernel,
                                          breadth_first=breadth_first)
                assert_allclose(dens,
                                dens_true,
                                atol=atol,
                                rtol=max(rtol, 1e-7))

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        yield (check_results, kernel, h, atol, rtol,
                               breadth_first)
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Radius queries around the origin must match a brute-force filter."""
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # Pass the query as a one-row batch: 1-D query points are not
        # accepted by current scikit-learn.
        ind = kdt.query_radius([query_pt], r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert_allclose(i, ind)
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """Check query_radius against a direct distance threshold."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    # Small slack, because roundoff error can otherwise fail the test.
    tolerance = 1E-15
    tree = KDTree(X, leaf_size=5)
    norms = np.sqrt(((X - origin)**2).sum(1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        limit = radius + tolerance
        found = tree.query_radius([origin], limit)[0]
        expected = np.where(norms <= limit)[0]

        found.sort()
        expected.sort()

        assert_array_almost_equal(expected, found)
def generate_pairs_raw(patches, constants):
    """Return the 2 nearest raw patches for every patch of the first image.

    Each ``raw_patch`` in ``patches[0]`` is flattened to a vector; the
    vectors are indexed in a KD-tree and queried against themselves.  Two
    neighbours are requested because each patch also matches itself.
    ``constants`` is not used.
    """
    flattened = np.vstack(
        [np.reshape(patch.raw_patch, [-1]) for patch in patches[0]])

    indexed = np.concatenate([flattened])
    tree = KDTree(indexed, leaf_size=30, metric='euclidean')
    neighbours = tree.query(flattened,
                            k=2,
                            return_distance=False,
                            sort_results=False)
    return np.concatenate([neighbours])
def test_kd_tree_query_radius(n_samples=100, n_features=10):
    """query_radius must return exactly the points inside the radius."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    centre = np.zeros(n_features, dtype=float)

    slack = 1E-15  # guards against roundoff at the boundary
    kd_tree = KDTree(X, leaf_size=5)
    distances = np.sqrt(((X - centre) ** 2).sum(1))

    radii = np.linspace(distances[0], distances[-1], 100)
    for radius in radii:
        from_tree = kd_tree.query_radius([centre], radius + slack)[0]
        from_scan = np.where(distances <= radius + slack)[0]

        from_tree.sort()
        from_scan.sort()

        assert_array_almost_equal(from_scan, from_tree)
def __call__(self, x, ma):
    """Estimate one Q-value per action from the episodic memory ``ma``.

    The input is embedded with ``self.l0``, ``self.l1`` and ``self.l2``,
    each followed by ``tanh``.  For each action ``j`` the (up to 50)
    stored embeddings in ``ma.mah[j]`` closest to the current embedding
    are found with a KD-tree and their stored values ``ma.maq[j]`` are
    averaged with inverse-distance weights.

    Returns:
        tuple: ``(qa, q_train, ind_list, dist_list, h.data)`` where ``qa``
        is ``self.q_list`` (wrapped in a ``chainer.Variable`` when
        ``self.use_gpu`` is set) and the three lists hold, per action, the
        differentiable Q estimate, the neighbour indices and the
        neighbour distances.
    """
    h = F.tanh(self.l0(x))
    h = F.tanh(self.l1(h))
    h = F.tanh(self.l2(h))

    q_train = []    # per-action Q estimates, used for training
    ind_list = []   # per-action neighbour indices, used for training
    dist_list = []  # per-action neighbour distances, used for training
    h_ = h.data
    for j in range(len(ma.maq)):  # loop over the actions
        h_list = ma.mah[j]
        lp = len(h_list)
        # Integer division: leaf_size has to be an int, and "/" yields a
        # float on Python 3.
        leaf_size = lp + lp // 2
        tree = KDTree(h_list, leaf_size=leaf_size)
        k = min(lp, 50)
        dist, ind = tree.query(h_, k=k)

        mahi = ma.mah[j][ind[0]]
        hi = chainer.Variable(cuda.to_cpu(mahi))
        tiled_h = chainer.Variable(np.tile(h.data, (len(ind[0]), 1)))
        diff = tiled_h - hi
        # Inverse Euclidean distance; 1e-3 avoids division by zero.
        wi = F.expand_dims(
            1 / (F.sqrt(F.sum(diff * diff, axis=1) + 1e-3)), 1)
        w = F.sum(wi, axis=0)

        maqi = ma.maq[j][ind[0]]
        q = chainer.Variable(cuda.to_cpu(maqi))
        qq = F.expand_dims(F.sum(wi * q, axis=0) / w, 1)

        q_train.append(qq)
        ind_list.append(ind)
        dist_list.append(dist)
        self.q_list[0][j] = qq.data

    if self.use_gpu:
        qa = chainer.Variable(cuda.to_cpu(self.q_list))
    else:
        qa = self.q_list
    return qa, q_train, ind_list, dist_list, h.data
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances returned by query_radius must match recomputed ones."""
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # Pass the query as a one-row batch: 1-D query points are not
        # accepted by current scikit-learn.
        ind, dist = kdt.query_radius([query_pt], r + eps,
                                     return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_allclose(d, dist)
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    np.random.seed(0)
    samples = np.random.normal(0, 1, n_samples)
    grid = np.linspace(-5, 5, 30)

    for bandwidth in (0.01, 0.1, 1):
        tree = KDTree(samples[:, None])
        try:
            reference = gaussian_kde(samples,
                                     bw_method=bandwidth / np.std(samples))
        except TypeError:
            raise SkipTest("Old scipy, does not accept explicit bandwidth.")

        tree_density = tree.kernel_density(grid[:, None],
                                           bandwidth) / n_samples
        reference_density = reference.evaluate(grid)

        assert_array_almost_equal(tree_density, reference_density, decimal=3)
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Check the distances query_radius reports for each returned point."""
    rng = check_random_state(0)
    X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
    origin = np.zeros(n_features, dtype=float)

    # Small slack, because roundoff error can otherwise fail the test.
    slack = 1E-15
    tree = KDTree(X, leaf_size=5)
    norms = np.sqrt(((X - origin) ** 2).sum(1))

    for radius in np.linspace(norms[0], norms[-1], 100):
        hits, hit_dists = tree.query_radius([origin],
                                            radius + slack,
                                            return_distance=True)
        members = hits[0]
        reported = hit_dists[0]

        recomputed = np.sqrt(((origin - X[members]) ** 2).sum(1))

        assert_array_almost_equal(recomputed, reported)
def test_kd_tree_query_radius_distance(n_samples=100, n_features=10):
    """Distances returned by query_radius must equal recomputed norms."""
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1e-15  # roundoff error can cause test to fail
    kdt = KDTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        # A 2-D (one-row) query is used: 1-D query points are not
        # accepted by current scikit-learn.
        ind, dist = kdt.query_radius([query_pt], r + eps,
                                     return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_allclose(d, dist)
def test_gaussian_kde(n_samples=1000):
    """Check KDTree gaussian densities against scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    np.random.seed(0)
    x_train = np.random.normal(0, 1, n_samples)
    x_eval = np.linspace(-5, 5, 30)
    train_column = x_train[:, None]
    eval_column = x_eval[:, None]

    for width in [0.01, 0.1, 1]:
        kd_tree = KDTree(train_column)
        try:
            scipy_kde = gaussian_kde(x_train,
                                     bw_method=width / np.std(x_train))
        except TypeError:
            raise SkipTest("Old scipy, does not accept explicit bandwidth.")

        density_tree = kd_tree.kernel_density(eval_column, width) / n_samples
        density_scipy = scipy_kde.evaluate(x_eval)

        assert_array_almost_equal(density_tree, density_scipy, decimal=3)
def _fit(self, X): #use Euclidean metric if possible, or raise error [IY] #note that in sompy.project_realdata() the algorithm is set by default # (e.g. to 'brute' or 'kd_tree') if self.metric_params is None: self.effective_metric_params_ = {} else: self.effective_metric_params_ = self.metric_params.copy() if self.metric not in ['euclidean', 'minkowski']: raise ValueError("Using Euclidean distance with the wrong metric") self.effective_metric_ = self.metric # For minkowski distance, use more efficient methods where available if self.metric == 'minkowski': p = self.effective_metric_params_.pop('p', 2) if p == 2: self.effective_metric_ = 'euclidean' else: raise ValueError( "cannot replace Minkowski with Euclidian metric") X = check_array(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") if issparse(X) and self.effective_metric_ not in VALID_METRICS_SPARSE[ 'brute']: raise ValueError("metric '%s' not valid for sparse input" % self.effective_metric_) self._fit_method = self.algorithm self._fit_X = X if self._fit_method == 'ball_tree': self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_) elif self._fit_method == 'kd_tree': self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_) elif self._fit_method == 'brute': self._tree = None else: raise ValueError("algorithm = '%s' not recognized" % self.algorithm) if self.n_neighbors is not None: if self.n_neighbors <= 0: raise ValueError("Expected n_neighbors > 0. Got %d" % self.n_neighbors) return self
def write(self, h, v):
    """Store value ``v`` under key ``h`` in the memory.

    * If a stored key has distance exactly 0 to ``h``, its value is moved
      towards ``v`` by the step size ``self.lr`` and nothing is added.
    * Otherwise, while fewer than ``self.capacity`` entries are stored,
      the pair is appended and the age of its slot is reset to 0.
    * Otherwise the entry at ``np.argmin(self.ages)`` is overwritten.
    """
    if len(self.memory_keys) > 0:
        keys = np.array(self.memory_keys, dtype=np.float32)
        tree = KDTree(keys, leaf_size=50)
        distance, index = tree.query(np.array([h], dtype=np.float32))
        if distance[0][0] == 0:
            # Exact hit: update the stored value in place.
            index = index[0][0]
            self.memory_values[index] += self.lr * (
                v - self.memory_values[index])
            return

    if len(self.memory_values) < self.capacity:
        # The new entry lands at index len(memory_values).  The previous
        # code reset index len - 1, i.e. the neighbouring slot (or, for an
        # empty memory, the last slot via index -1).
        self.ages[len(self.memory_values)] = 0
        self.memory_keys.append(h)
        self.memory_values.append(v)
    else:
        # NOTE(review): this evicts the entry with the smallest age;
        # confirm that is the intended replacement policy.
        index = np.argmin(self.ages)
        self.memory_keys[index] = h
        self.memory_values[index] = v
        self.ages[index] = 0
def add(self, state, value, time):
    """Insert an entry and rebuild the KD-tree over all stored states.

    While there is room the entry is appended.  Once ``self.capacity`` is
    reached, the entry with the smallest stored time is replaced, but only
    if ``time`` is larger than that entry's time.
    """
    if len(self) >= self.capacity:
        oldest = int(np.argmin(self.times))
        if time > self.times[oldest]:
            self.replace(state, value, time, oldest)
    else:
        self.states.append(state)
        self.values.append(value)
        self.times.append(time)

    # The tree is rebuilt on every call, whether or not anything changed.
    self._tree = KDTree(np.array(self.states))
def test_gaussian_kde(n_samples=1000):
    """KDTree gaussian KDE must agree with scipy.stats.gaussian_kde."""
    from scipy.stats import gaussian_kde

    np.random.seed(0)
    data = np.random.normal(0, 1, n_samples)
    points = np.linspace(-5, 5, 30)

    for bw in (0.01, 0.1, 1):
        tree = KDTree(data[:, None])
        try:
            scipy_estimator = gaussian_kde(data,
                                           bw_method=bw / np.std(data))
        except TypeError:
            # older versions of scipy don't accept explicit bandwidth
            raise SkipTest

        from_tree = tree.kernel_density(points[:, None], bw) / n_samples
        from_scipy = scipy_estimator.evaluate(points)

        assert_allclose(from_tree, from_scipy, rtol=1E-3, atol=1E-3)
def lookup(self, h):
    """Fetch the nearest stored keys and values for each state in ``h``.

    For every encoded state the ``k = min(len(memory), self.p)`` nearest
    keys are retrieved.  After each query all ages are incremented and
    the ages of the retrieved entries are reset to 0.

    Returns:
        tuple: ``(queried_keys, queried_values)`` as float32 arrays of
        shape ``(len(h), k, len(h[0]))`` and ``(len(h), k)``.  With an
        empty memory, zero arrays with ``k == 1`` are returned.
    """
    if len(self.memory_values) == 0:
        return (np.zeros((len(h), 1, len(h[0])), dtype=np.float32),
                np.zeros((len(h), 1), dtype=np.float32))

    keys = np.array(self.memory_keys, dtype=np.float32)
    values = np.array(self.memory_values, dtype=np.float32)
    k = min(keys.shape[0], self.p)

    queried_keys = np.zeros((len(h), k, len(h[0])), dtype=np.float32)
    queried_values = np.zeros((len(h), k), dtype=np.float32)

    # The keys do not change while looking up, so build the tree once
    # instead of once per query.
    tree = KDTree(keys, leaf_size=50)
    for i, encoded_state in enumerate(h):
        _, indices = tree.query(
            np.array([encoded_state], dtype=np.float32), k=k)
        queried_keys[i] = keys[indices]
        queried_values[i] = values[indices][-1]
        self.ages += 1
        self.ages[indices] = 0
    return queried_keys, queried_values
def fit_predict(self, xs: np.ndarray, ys: np.ndarray = None):
    """Cluster ``xs`` by linking points to neighbours within ``self.eps``.

    A point with at least ``self.min_samples`` neighbours inside radius
    ``self.eps`` is merged with all of those neighbours in a disjoint-set
    structure.  Every resulting set gets its own consecutive id.

    ``ys`` is used as the label buffer and is written to when given.

    Returns a list with one cluster id per point.
    """
    tree = KDTree(xs, metric=self.metric, leaf_size=self.leaf_size)
    n_points = xs.shape[0]
    neighbourhoods = tree.query_radius(X=xs, r=self.eps)

    components = DisjointSetUnion(n_points)
    for point, members in enumerate(neighbourhoods):
        if members.shape[0] >= self.min_samples:
            for other in members:
                components.merge(point, other)

    if ys is None:
        ys = [0] * n_points

    # Number the set representatives in index order.
    next_label = 0
    for point in range(n_points):
        if point == components.find(point):
            ys[point] = next_label
            next_label += 1

    return [ys[components.find(point)] for point in range(n_points)]
def test_kd_tree_kde(kernel, h):
    """Run check_results for every tolerance and traversal combination."""
    n_samples, n_features = 100, 3
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    tree = KDTree(X, leaf_size=10)

    expected_density = compute_kernel_slow(Y, X, kernel, h)

    settings = [(atol, rtol, breadth_first)
                for rtol in [0, 1E-5]
                for atol in [1E-6, 1E-2]
                for breadth_first in (True, False)]
    for atol, rtol, breadth_first in settings:
        check_results(kernel, h, atol, rtol, breadth_first, Y, tree,
                      expected_density)
def test_kd_tree_two_point(n_samples=100, n_features=3):
    """Yield two-point correlation checks, with and without dual tree."""
    np.random.seed(0)
    X = np.random.random((n_samples, n_features))
    Y = np.random.random((n_samples, n_features))
    r = np.linspace(0, 1, 10)
    tree = KDTree(X, leaf_size=10)

    # Brute-force reference counts from the full distance matrix.
    pairwise = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
    expected_counts = [(pairwise <= radius).sum() for radius in r]

    def check_two_point(r, dualtree):
        observed = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
        assert_allclose(observed, expected_counts)

    for use_dualtree in (True, False):
        yield check_two_point, r, use_dualtree
def test_kd_tree_kde(n_samples=100, n_features=3):
    """Yield one check_results call per kernel/bandwidth/tolerance combo."""
    rng = check_random_state(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    tree = KDTree(X, leaf_size=10)

    kernels = [
        'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
        'cosine'
    ]
    # (atol, rtol, breadth_first), in the order the checks are yielded.
    settings = [(atol, rtol, breadth_first)
                for rtol in [0, 1E-5]
                for atol in [1E-6, 1E-2]
                for breadth_first in (True, False)]

    for kernel in kernels:
        for h in [0.01, 0.1, 1]:
            expected_density = compute_kernel_slow(Y, X, kernel, h)
            for atol, rtol, breadth_first in settings:
                yield (check_results, kernel, h, atol, rtol, breadth_first,
                       Y, tree, expected_density)
def add(self, state, value, time, update_type):
    """Insert an entry and rebuild the KD-tree over all stored states.

    While there is room the entry is appended and a fresh history
    ``[value]`` is started in ``self.old_vals``.  Once ``self.capacity``
    is reached, an existing entry is replaced only if ``time`` is larger
    than the smallest stored time.  The replaced entry is the one with
    the highest variance in ``self.old_vals`` when ``update_type`` is
    ``'time average'``, otherwise the one with the smallest time.
    """
    if len(self) >= self.capacity:
        oldest = int(np.argmin(self.times))
        if time > self.times[oldest]:
            if update_type == 'time average':
                history = np.asarray(self.old_vals)
                victim = int(np.argmax(np.var(history, axis=1)))
            else:
                victim = oldest
            self.replace(state, value, time, victim)
    else:
        self.states.append(state)
        self.values.append(value)
        self.times.append(time)
        self.old_vals.append([value])

    # The tree is rebuilt on every call, whether or not anything changed.
    self._tree = KDTree(np.array(self.states))
def _fit(self, X): self._check_algorithm_metric() self._check_hubness_algorithm() self._check_algorithm_hubness_compatibility() if self.metric_params is None: self.effective_metric_params_ = {} else: self.effective_metric_params_ = self.metric_params.copy() effective_p = self.effective_metric_params_.get('p', self.p) if self.metric in ['wminkowski', 'minkowski']: self.effective_metric_params_['p'] = effective_p self.effective_metric_ = self.metric # For minkowski distance, use more efficient methods where available if self.metric == 'minkowski': p = self.effective_metric_params_.pop('p', 2) if p <= 0: raise ValueError( f"p must be greater than one for minkowski metric, " f"or in ]0, 1[ for fractional norms.") elif p == 1: self.effective_metric_ = 'manhattan' elif p == 2: self.effective_metric_ = 'euclidean' elif p == np.inf: self.effective_metric_ = 'chebyshev' else: self.effective_metric_params_['p'] = p if isinstance(X, NeighborsBase): self._fit_X = X._fit_X self._tree = X._tree self._fit_method = X._fit_method self._index = X._index self._hubness_reduction = X._hubness_reduction return self elif isinstance(X, BallTree): self._fit_X = X.data self._tree = X self._fit_method = 'ball_tree' return self elif isinstance(X, KDTree): self._fit_X = X.data self._tree = X self._fit_method = 'kd_tree' return self elif isinstance(X, ApproximateNearestNeighbor): self._tree = None if isinstance(X, PuffinnLSH): self._fit_X = X.X_train_ self._fit_method = 'lsh' elif isinstance(X, FalconnLSH): self._fit_X = X.X_train_ self._fit_method = 'falconn_lsh' elif isinstance(X, ONNG): self._fit_method = 'onng' elif isinstance(X, HNSW): self._fit_method = 'hnsw' elif isinstance(X, RandomProjectionTree): self._fit_method = 'rptree' self._index = X # TODO enable hubness reduction here ... 
return self X = check_array(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: raise ValueError( f"n_samples must be greater than 0 (but was {n_samples}.") if issparse(X): if self.algorithm not in ('auto', 'brute'): warnings.warn("cannot use tree with sparse input: " "using brute force") if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \ and not callable(self.effective_metric_): raise ValueError( f"Metric '{self.effective_metric_}' not valid for sparse input. " f"Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) " f"to get valid options. Metric can also be a callable function." ) self._fit_X = X.copy() self._tree = None self._fit_method = 'brute' if self.hubness is not None: warnings.warn( f'cannot use hubness reduction with tree: disabling hubness reduction.' ) self.hubness = None self._hubness_reduction_method = None self._hubness_reduction = NoHubnessReduction() return self self._fit_method = self.algorithm self._fit_X = X self._hubness_reduction_method = self.hubness if self._fit_method == 'auto': # A tree approach is better for small number of neighbors, # and KDTree is generally faster when available if ((self.n_neighbors is None or self.n_neighbors < self._fit_X.shape[0] // 2) and self.metric != 'precomputed'): if self.effective_metric_ in VALID_METRICS['kd_tree']: self._fit_method = 'kd_tree' elif (callable(self.effective_metric_) or self.effective_metric_ in VALID_METRICS['ball_tree']): self._fit_method = 'ball_tree' else: self._fit_method = 'brute' else: self._fit_method = 'brute' self._index = None if self._fit_method == 'ball_tree': self._tree = BallTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_) self._index = None elif self._fit_method == 'kd_tree': self._tree = KDTree(X, self.leaf_size, metric=self.effective_metric_, **self.effective_metric_params_) self._index = None elif self._fit_method == 'brute': self._tree = None self._index = None elif self._fit_method == 'lsh': 
self._index = PuffinnLSH(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None elif self._fit_method == 'falconn_lsh': self._index = FalconnLSH(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None elif self._fit_method == 'onng': self._index = ONNG(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None elif self._fit_method == 'hnsw': self._index = HNSW(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None elif self._fit_method == 'rptree': self._index = RandomProjectionTree(verbose=self.verbose, **self.algorithm_params) self._index.fit(X) self._tree = None # because it's a tree, but not an sklearn tree... else: raise ValueError(f"algorithm = '{self.algorithm}' not recognized") if self._hubness_reduction_method is None: self._hubness_reduction = NoHubnessReduction() else: n_candidates = self.algorithm_params['n_candidates'] if 'include_self' in self.kwargs and self.kwargs['include_self']: neigh_train = self.kcandidates(X, n_neighbors=n_candidates, return_distance=True) else: neigh_train = self.kcandidates(n_neighbors=n_candidates, return_distance=True) # Remove self distances neigh_dist_train = neigh_train[0] # [:, 1:] neigh_ind_train = neigh_train[1] # [:, 1:] if self._hubness_reduction_method == 'ls': self._hubness_reduction = LocalScaling(verbose=self.verbose, **self.hubness_params) elif self._hubness_reduction_method == 'mp': self._hubness_reduction = MutualProximity( verbose=self.verbose, **self.hubness_params) elif self._hubness_reduction_method == 'dsl': self._hubness_reduction = DisSimLocal(verbose=self.verbose, **self.hubness_params) elif self._hubness_reduction_method == 'snn': raise NotImplementedError('feature not yet implemented') elif self._hubness_reduction_method == 'simhubin': raise NotImplementedError('feature not yet implemented') else: raise ValueError( f'Hubness reduction algorithm = "{self._hubness_reduction_method}" not 
recognized.' ) self._hubness_reduction.fit(neigh_dist_train, neigh_ind_train, X=X, assume_sorted=False) if self.n_neighbors is not None: if self.n_neighbors <= 0: raise ValueError( f"Expected n_neighbors > 0. Got {self.n_neighbors:d}") else: if not np.issubdtype(type(self.n_neighbors), np.integer): raise TypeError( f"n_neighbors does not take {type(self.n_neighbors)} value, " f"enter integer value") return self
def compute_average_scores(pdb_path, cat, it, bu):
    """Map patch predictions onto residues and write per-residue scores.

    For every file matching ``<pdb_path>*_<it>_<bu>.pdb`` this reads
    ``<pdb_id>_patch_coord.txt`` and ``<pdb_id>_patch_score.txt``,
    classifies the patches, and adds ``1 / (1 + distance)`` to each
    residue that has an atom within ``mapping_distance`` of a positively
    predicted patch (each residue at most once per patch).  The sums are
    averaged over the repetitions and written to
    ``<pdb_path>our_prediction/<pdb_id>_residue_scores.txt`` as
    ``chain;residue;score`` lines.

    This is Python 2 code (print statements, ``xrange``).  It relies on
    names defined outside this function: ``threshold``,
    ``outlier_fraction``, ``n_iterations``, ``mapping_distance``, ``p``,
    ``isHydrogen`` and ``isHETATM``.
    NOTE(review): ``p`` is presumably a PDB structure parser -- confirm.

    :param pdb_path: directory prefix for input and output files; it is
        concatenated directly with file names
    :param cat: first component of the key into ``threshold`` and
        ``outlier_fraction``
    :param it: second key component, also part of the PDB file pattern
    :param bu: third key component, also part of the PDB file pattern
    """
    files = glob("%s*_%s_%s.pdb" % (pdb_path, it, bu))
    for pdb_filename in sorted(files) :
        # strip the 4-character ".pdb" extension
        pdb_id = basename(pdb_filename)[:-4]
        pdb_patch_coord = ("%s%s_patch_coord.txt" % (pdb_path, pdb_id))
        pdb_patch_score = ("%s%s_patch_score.txt" % (pdb_path, pdb_id))
        with open(pdb_patch_coord) as coord, open(pdb_patch_score) as score:
            # one patch per line: whitespace-separated coordinates
            patch_coord = [[float(x) for x in a.split()] for a in coord.readlines()]
            # shift the scores so that the threshold sits at 0
            patch_score = [float(x) - threshold[(cat, it, bu)] for x in score.readlines()]
        min_v = min(patch_score)
        max_v = max(patch_score)
        # negative scores are divided by the minimum and negated, the
        # others are divided by the maximum
        patch_score_scaled = [(lambda x: -(x / min_v) if x < 0 else (x / max_v))(x) for x in patch_score]
        # training set: coordinates and scaled scores of the patches with
        # a non-negative scaled score
        X = np.array([a[0] for a in zip(patch_coord, patch_score_scaled) if a[1] >= 0])
        X_weights = np.array([x for x in patch_score_scaled if x >= 0])
        pdb_structure = p.get_structure(pdb_id, pdb_filename)
        atoms = np.array([atm.get_coord() for atm in pdb_structure.get_atoms() if not isHydrogen(atm) and not isHETATM(atm)])
        atoms_tree = KDTree(atoms)
        # lookup table from an atom's coordinates to its residue
        residues_coord = {}
        for residue in pdb_structure.get_residues() :
            for atm in residue :
                residues_coord[tuple(atm.get_coord())] = residue
        average_residues_scores = {residue : 0 for residue in pdb_structure.get_residues()}
        # since the isolation forest algorithm is random, we run it several
        # times to assess the average performance of the method
        if outlier_fraction[(cat, it, bu)] :
            reps = n_iterations
        else :
            reps = 1
        for iteration in xrange(reps) :
            print "Running iteration %d of %d" % (iteration + 1, reps)
            if outlier_fraction[(cat, it, bu)] :
                forest = IsolationForest(contamination=outlier_fraction[(cat, it, bu)], n_jobs=-1)
                forest.fit(X, sample_weight=X_weights)
                prediction_isolation_forest = forest.predict(patch_coord)
                patch_pred_no_outliers = [copysign(1, x) for x in prediction_isolation_forest]
            else :
                # no outlier removal: the sign of the shifted score decides
                patch_pred_no_outliers = [copysign(1, x) for x in patch_score]
            # here we map the patch predictions on the underlying residues
            for i in xrange(len(patch_coord)) :
                # for each patch
                # if it was predicted as non-interface continue to the next
                if patch_pred_no_outliers[i] < 0 :
                    continue
                # multiple residues can be underneath a given patch, we do not want to consider the same residue more than once
                marked_residues = set()
                # get all atoms within mapping_distance from the given patch center
                indexes = atoms_tree.query_radius([patch_coord[i]], r=mapping_distance, count_only = False, return_distance=True, sort_results = True)
                # ind is an (atom index, distance) pair
                for ind in zip(indexes[0][0], indexes[1][0]) :
                    # which residue does the current atom belong to?
                    current_res = residues_coord[tuple(atoms[ind[0]])]
                    # if already considered continue to the next
                    if current_res in marked_residues :
                        continue
                    # increase the score of the current residue
                    average_residues_scores[current_res] += 1 / (1.0 + ind[1]) # patch_pred_no_outliers[i] / (1.0 + ind[1])
                    # mark as seen for the current patch
                    marked_residues.add(current_res)
        # turn the accumulated sums into averages over the repetitions
        average_residues_scores.update((x, y / reps) for x, y in average_residues_scores.items())
        # (x[2], residue label built from x[3][1] and x[3][2], score),
        # all taken from residue.get_full_id()
        residues_with_scores = [(lambda x, y : (x[2], str(x[3][1]) + x[3][2], y))(residue.get_full_id(), score) for residue, score in average_residues_scores.items()]
        # two stable sorts: by the first field, then by the residue label
        # (a string) within equal first fields
        residues_with_scores.sort(key=lambda x : x[1])
        residues_with_scores.sort(key=lambda x : x[0])
        prediction_path = pdb_path + "our_prediction/"
        if not path.exists(prediction_path) :
            makedirs(prediction_path)
        print pdb_id
        with open("%s%s_residue_scores.txt" % (prediction_path, pdb_id), "wb") as output_residue_scores :
            for r in residues_with_scores :
                output_residue_scores.write("%s;%s;%f\n" %(r[0], r[1], r[2]))
import cv2
import pickle
# KDTree is imported from the public package: the private module
# sklearn.neighbors.kd_tree is not available in newer scikit-learn.
from sklearn.neighbors import KDTree
import numpy as np
from bagofvisualwords import BagOfVisualWords
from VLADlib.VLAD import *
from VLADlib.Descriptors import *

# Load the pickled visual-word vectors and index them in a KD-tree.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
pathVD = "visualWords/visualWords.pickle"
with open(pathVD, 'rb') as f:
    vocab = pickle.load(f)

# Each vocabulary entry is converted to one dense row.
training = np.asarray([word.toarray()[0].tolist() for word in vocab])
tree = KDTree(training, leaf_size=2)

image = 'dataset/3.jpg'
im = cv2.imread(image)

# initial BoW
pathVD = 'visualDictionary/visualDictionary2ORB.pickle'
with open(pathVD, 'rb') as g:
    visualDictionary = pickle.load(g)
bovw = BagOfVisualWords(visualDictionary.cluster_centers_)

# compute descriptors
kp, descriptor = describeORB(im)

# represent at BoW
from sklearn.neighbors import NearestNeighbors
# KDTree is imported from the public package: the private module
# sklearn.neighbors.kd_tree is not available in newer scikit-learn.
from sklearn.neighbors import KDTree
import numpy as np
import get_data2 as gd

headers = gd.get_headers()
dicts = gd.get_data_list_of_dicts()

# The number of rows is taken once from the first column.
n_rows = len(gd.get_data_slice(headers[0], dicts))

# Transpose the per-header numeric columns into one list per data row.
rows_lol = [[] for _ in range(n_rows)]
for header in headers:
    column = gd.get_data_slice_numbers(header, dicts)
    for j in range(n_rows):
        rows_lol[j].append(column[j])

X = np.array(rows_lol)

# Alternative tried earlier:
# NearestNeighbors(n_neighbors=5, algorithm='kd_tree', metric='jaccard').fit(X)
kdt = KDTree(X, leaf_size=30, metric='euclidean')
kdt.query(X, k=3, return_distance=False)
def _fit(self, X):
    """Fit the neighbour index and the hubness reduction.

    After the three ``_check_*`` validators the effective metric is
    resolved, then ``X`` is handled in one of three ways:

    * ``X`` is an already fitted object (``NeighborsBase``, ``BallTree``,
      ``KDTree`` or an ``ApproximateNearestNeighbor``): its structures
      are adopted and the method returns early.
    * ``X`` is sparse: brute force is used and hubness reduction is
      switched off.
    * otherwise the tree or approximate index selected by
      ``self.algorithm`` is built and ``self._set_hubness_reduction(X)``
      is called.

    Returns ``self``.  Raises ``ValueError`` for a non-positive minkowski
    ``p``, empty input, a metric that is invalid for sparse input, an
    unknown algorithm or ``n_neighbors <= 0``, and ``TypeError`` for a
    non-integer ``n_neighbors``.
    """
    self._check_algorithm_metric()
    self._check_hubness_algorithm()
    self._check_algorithm_hubness_compatibility()
    # Work on a copy so the user-supplied metric_params stay untouched.
    if self.metric_params is None:
        self.effective_metric_params_ = {}
    else:
        self.effective_metric_params_ = self.metric_params.copy()

    # A 'p' given in metric_params takes precedence over self.p.
    effective_p = self.effective_metric_params_.get('p', self.p)
    if self.metric in ['wminkowski', 'minkowski']:
        self.effective_metric_params_['p'] = effective_p

    self.effective_metric_ = self.metric
    # For minkowski distance, use more efficient methods where available
    if self.metric == 'minkowski':
        p = self.effective_metric_params_.pop('p', 2)
        if p <= 0:
            raise ValueError(f"p must be greater than one for minkowski metric, "
                             f"or in ]0, 1[ for fractional norms.")
        elif p == 1:
            self.effective_metric_ = 'manhattan'
        elif p == 2:
            self.effective_metric_ = 'euclidean'
        elif p == np.inf:
            self.effective_metric_ = 'chebyshev'
        else:
            # no named equivalent: keep minkowski and put p back
            self.effective_metric_params_['p'] = p

    # Already fitted objects: adopt their structures instead of refitting.
    if isinstance(X, NeighborsBase):
        self._fit_X = X._fit_X
        self._tree = X._tree
        self._fit_method = X._fit_method
        self._index = X._index
        self._hubness_reduction = X._hubness_reduction
        return self

    elif isinstance(X, BallTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'ball_tree'
        return self

    elif isinstance(X, KDTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'kd_tree'
        return self

    elif isinstance(X, ApproximateNearestNeighbor):
        self._tree = None
        if isinstance(X, PuffinnLSH):
            # training data is rebuilt from the index entries, multiplied
            # by X.X_indexed_norm_
            self._fit_X = np.array([X.index_.get(i) for i in range(X.n_indexed_)]) * X.X_indexed_norm_
            self._fit_method = 'lsh'
        elif isinstance(X, FalconnLSH):
            self._fit_X = X.X_train_
            self._fit_method = 'falconn_lsh'
        elif isinstance(X, NNG):
            self._fit_X = None
            self._fit_method = 'nng'
        elif isinstance(X, HNSW):
            self._fit_X = None
            self._fit_method = 'hnsw'
        elif isinstance(X, RandomProjectionTree):
            self._fit_X = None
            self._fit_method = 'rptree'
        self._index = X
        # TODO enable hubness reduction here.
        # We do not store X_train in all cases atm.
        # self._hubness_reduction_method = self.hubness
        # self._set_hubness_reduction(self._fit_X)
        return self

    X = check_array(X, accept_sparse='csr')

    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError(f"n_samples must be greater than 0 (but was {n_samples}.")

    # Sparse input: always brute force, never hubness reduction.
    if issparse(X):
        if self.algorithm not in ('auto', 'brute'):
            warnings.warn("cannot use tree with sparse input: "
                          "using brute force")
        if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \
                and not callable(self.effective_metric_):
            raise ValueError(f"Metric '{self.effective_metric_}' not valid for sparse input. "
                             f"Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) "
                             f"to get valid options. Metric can also be a callable function.")
        self._fit_X = X.copy()
        self._tree = None
        self._fit_method = 'brute'
        if self.hubness is not None:
            warnings.warn(f'cannot use hubness reduction with sparse data: disabling hubness reduction.')
            self.hubness = None
        self._hubness_reduction_method = None
        self._hubness_reduction = NoHubnessReduction()
        return self

    self._fit_method = self.algorithm
    self._fit_X = X
    self._hubness_reduction_method = self.hubness

    if self._fit_method == 'auto':
        # A tree approach is better for small number of neighbors,
        # and KDTree is generally faster when available
        if ((self.n_neighbors is None or
             self.n_neighbors < self._fit_X.shape[0] // 2) and
                self.metric != 'precomputed'):
            if self.effective_metric_ in VALID_METRICS['kd_tree']:
                self._fit_method = 'kd_tree'
            elif (callable(self.effective_metric_) or
                    self.effective_metric_ in VALID_METRICS['ball_tree']):
                self._fit_method = 'ball_tree'
            else:
                self._fit_method = 'brute'
        else:
            self._fit_method = 'brute'
    self._index = None

    # Exactly one of self._tree (exact sklearn trees) and self._index
    # (approximate methods) is set below; 'brute' sets neither.
    if self._fit_method == 'ball_tree':
        self._tree = BallTree(X, self.leaf_size,
                              metric=self.effective_metric_,
                              **self.effective_metric_params_)
        self._index = None
    elif self._fit_method == 'kd_tree':
        self._tree = KDTree(X, self.leaf_size,
                            metric=self.effective_metric_,
                            **self.effective_metric_params_)
        self._index = None
    elif self._fit_method == 'brute':
        self._tree = None
        self._index = None
    elif self._fit_method == 'lsh':
        self._index = PuffinnLSH(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'falconn_lsh':
        self._index = FalconnLSH(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'nng':
        self._index = NNG(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'hnsw':
        self._index = HNSW(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'rptree':
        self._index = RandomProjectionTree(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None  # because it's a tree, but not an sklearn tree...
    else:
        raise ValueError(f"algorithm = '{self.algorithm}' not recognized")

    # Fit hubness reduction method
    self._set_hubness_reduction(X)

    # n_neighbors is validated last, after all structures are built.
    if self.n_neighbors is not None:
        if self.n_neighbors <= 0:
            raise ValueError(f"Expected n_neighbors > 0. Got {self.n_neighbors:d}")
        else:
            if not np.issubdtype(type(self.n_neighbors), np.integer):
                raise TypeError(
                    f"n_neighbors does not take {type(self.n_neighbors)} value, "
                    f"enter integer value"
                )

    return self