def knn_cond_mutual_information(x, y, z, k, standardize = True, dualtree = False):
    """
    Estimate the conditional mutual information I(x; y | z) with a k-NN estimator.

    The quantity estimated is
        I(x; y | z) = sum( p(x,y,z) * log( p(z)*p(x,y,z) / p(x,z)*p(y,z) ),
    where p(z), p(x,z), p(y,z) and p(x,y,z) are probability distributions.
    Neighbour searches use sklearn.neighbors.KDTree with the maximum
    (Chebyshev) norm.

    According to Frenzel S. and Pompe B., Phys. Rev. Lett., 99, 2007.

    x, y - the two time series
    z - conditioning series; a numpy array or a list of series (can be
        multi-dimensional)
    k - number of nearest neighbours used in the joint space
    standardize - whether transform data to zero mean and unit variance
    dualtree - whether to use dualtree formalism in k-d tree for the k-NN search
        could lead to better performance with large N

    Returns the estimated conditional mutual information.
    """

    from sklearn.neighbors import KDTree

    # prepare data
    if standardize:
        x = _center_ts(x)
        y = _center_ts(y)
        if isinstance(z, np.ndarray):
            z = _center_ts(z)
        elif isinstance(z, list):
            # The list has to be rebuilt: assigning to the loop variable would
            # leave every conditioning series un-standardized.
            z = [_center_ts(cond_ts) for cond_ts in z]
    z = np.atleast_2d(z)
    data = np.vstack([x, y, z]).T

    # build k-d tree using the maximum (Chebyshev) norm
    tree = KDTree(data, leaf_size = 15, metric = "chebyshev")
    # distance to the k-th nearest neighbour per point (k + 1 because the
    # query point itself is returned as its own nearest neighbour)
    dist, _ = tree.query(data, k = k + 1, return_distance = True, dualtree = dualtree)
    radii = dist[:, -1]

    # marginal spaces xz, yz and z (columns 0 and 1 of data are x and y)
    marginals = (
        np.delete(data, 1, axis = 1),
        np.delete(data, 0, axis = 1),
        np.delete(data, [0, 1], axis = 1),
    )

    # count points within the joint-space distance in each marginal space
    n_x_z, n_y_z, n_z = [
        KDTree(marginal, leaf_size = 15, metric = "chebyshev").query_radius(
            marginal, r = radii, count_only = True) - 2
        for marginal in marginals
    ]

    # average the per-point contributions
    sum_ = 0
    for c_xz, c_yz, c_z in zip(n_x_z, n_y_z, n_z):
        sum_ += _neg_harmonic(c_xz) + _neg_harmonic(c_yz) - _neg_harmonic(c_z)
    sum_ /= data.shape[0]

    return sum_ - _neg_harmonic(k-1)
def match_bright(x,y,x2,y2,mags,dist=1./3600.):
    """Match detected objects to the brightest truth object within a radius.

    Args:
    ----
        x: `float` RA of the truth objects to match (in degrees)
        y: `float` dec of the truth objects to match (in degrees)
        x2: `float` RA of detected objects to match (in degrees)
        y2: `float` dec of detected objects to match (in degrees)
        mags: `float` array containing the true input magnitudes
        dist: `float` maximum distance in degrees considered to match the
            objects, the default is 1 arcsecond.

    Returns:
    -------
        brightest_indices: `int` array with one entry per detected object:
            the index of the truth object with the smallest magnitude among
            those within `dist`, or -1 if no truth object is within `dist`.
    """
    truth_pos = np.column_stack([x, y]).astype(np.float64)
    detected_pos = np.column_stack([x2, y2]).astype(np.float64)

    tree = KDTree(truth_pos, leaf_size=40)
    candidates_per_object = tree.query_radius(detected_pos, r=dist)

    # Start from "no match" everywhere and overwrite where candidates exist.
    brightest_indices = np.full(len(candidates_per_object), -1, dtype=np.int64)
    for row, candidates in enumerate(candidates_per_object):
        by_magnitude = np.argsort(mags[candidates])
        if len(by_magnitude) > 0:
            # smallest magnitude first
            brightest_indices[row] = candidates[by_magnitude[0]]
    return brightest_indices
def concat_features_by_neighbors(df_labels, df_features, X_names=['Offense Type'], grid=["Latitude", "Longitude"], radius=1./500., scale=np.array([1.,1.])):
    """Append per-row neighbourhood value counts from df_features to df_labels.

    For every row of `df_labels`, the rows of `df_features` lying within
    `radius` (after multiplying the `grid` coordinates by `scale`) are looked
    up and the occurrences of each distinct value in the `X_names` columns
    are counted.

    :param df_labels: DataFrame to be extended; rows with missing `grid`
        values are dropped.
    :param df_features: DataFrame supplying the values to count; rows with
        missing `grid` values are dropped.
    :param X_names: columns of `df_features` whose values are counted.
    :param grid: coordinate columns present in both frames.
    :param radius: search radius in scaled coordinate units.
    :param scale: per-coordinate multiplier applied before the search.
    :return: `df_labels` (NaN-free in `grid`) concatenated column-wise with
        one count column per distinct value seen; missing counts are 0.
    """
    df_labels = df_labels.dropna(subset=grid)
    df_features = df_features.dropna(subset=grid)

    # DataFrame.as_matrix is gone from current pandas; selecting the columns
    # and taking .values is the equivalent on old and new versions.
    X = df_features[X_names].values
    xy_features = df_features[grid].values
    xy_labels = df_labels[grid].values

    tree = KDTree(xy_features * scale)

    # A dict is used as an insertion-ordered set so the column order is
    # stable between building the rows and labelling the columns.
    vocabulary = {}
    features = []
    for nei in tree.query_radius(xy_labels * scale, radius):
        # ravel() keeps np.bincount happy regardless of how np.unique shapes
        # the inverse for 2-D input.
        U, I = np.unique(X[nei].ravel(), return_inverse=True)
        D = dict(zip(U, np.bincount(I.ravel())))
        for value in D:
            vocabulary.setdefault(value, None)
        features.append(D)

    columns = list(vocabulary)
    counts = pd.DataFrame(
        [[fi.get(value) for value in columns] for fi in features],
        index=df_labels.index,
        columns=columns,
    ).fillna(0.)
    return pd.concat([df_labels, counts], axis=1)
def study_redmapper_lrg_3d(hemi='north'):
    """Histogram the number of SDSS objects within 100 of each redMaPPer cluster.

    Exploratory routine: it loads both catalogues for `hemi`, converts them
    to XYZ positions, plots the neighbour-count histogram and then drops
    into the ipdb debugger.
    """
    # 3d grid object used for the coordinate conversion
    grid = grid3d(hemi=hemi)

    # catalogues
    sdss = load_sdss_data_both_catalogs(hemi)
    rm = load_redmapper(hemi=hemi)

    # XYZ positions (Mpc) of both datasets, one row per object
    xyz_sdss = grid.xyz_from_radecz(sdss['ra'], sdss['dec'], sdss['z'], applyzcut=False)
    xyz_rm = grid.xyz_from_radecz(rm['ra'], rm['dec'], rm['z_spec'], applyzcut=False)
    x_sdss, y_sdss, z_sdss = xyz_sdss
    x_rm, y_rm, z_rm = xyz_rm
    pos_sdss = np.column_stack([x_sdss, y_sdss, z_sdss])
    pos_rm = np.column_stack([x_rm, y_rm, z_rm])

    # one KDTree per catalogue
    from sklearn.neighbors import KDTree
    tree_sdss = KDTree(pos_sdss, leaf_size=30)
    tree_rm = KDTree(pos_rm, leaf_size=30)

    # number of SDSS objects within radius 100. of every cluster
    lrg_counts = tree_sdss.query_radius(pos_rm, 100., count_only=True)

    pl.clf()
    pl.hist(lrg_counts, bins=50)
    ipdb.set_trace()
def count_close(x,y,x2,y2,distances):
    """Count the objects lying within a set of radii around given centres.

    Args:
    ----
        x: `float` position X of objects to count
        y: `float` position Y of objects to count
        x2: `float` position X of the objects that serve as the center of
            the circle where we look for neighbors
        y2: `float` position Y of the objects that serve as the center of
            the circle where we look for neighbors
        distances: `float` array of radii where to count the objects

    Returns:
    -------
        neighbors: `float` the mean number of neighbors in a circle of radii
            corresponding to each entry of distances
        err: `float` standard deviation of the number of neighbors in a
            circle of radii corresponding to each entry of distances
    """
    X = np.zeros((len(x), 2))
    X[:, 0] = x
    X[:, 1] = y
    Y = np.zeros((len(x2), 2))
    Y[:, 0] = x2
    Y[:, 1] = y2

    tree = KDTree(X, leaf_size=40)
    neighbors = np.zeros(len(distances))
    err = np.zeros(len(distances))
    for i, distance in enumerate(distances):
        # Query once per radius and derive both statistics from the result.
        counts = tree.query_radius(Y, r=distance, count_only=True)
        neighbors[i] = np.nanmean(counts)
        err[i] = np.nanstd(counts)
    return neighbors, err
def approximate_surface_node(atlas, node_id):
    """Approximate the surface of the region labelled `node_id` in `atlas`.

    Each element of the region contributes 7 minus the number of region
    elements within distance 1.0 of it (the element itself included); the
    contributions are summed.
    """
    # index coordinates of every element carrying the requested label
    voxels = np.argwhere(atlas == node_id)
    occupied = KDTree(voxels).query_radius(voxels, r=1.0, count_only=True)
    exposed = 7 - occupied
    return np.sum(exposed)
def constructQueryDict(df_centroids, filename):
    """Build and pickle the positive/negative query dictionary.

    For row i of `df_centroids`, positives are the other rows within
    distance 10 (in the posX/posY plane) and negatives are the index values
    of `df_centroids` not returned by the distance-50 search. Both lists are
    shuffled. The dictionary is printed and written to `filename`.
    """
    positions = df_centroids[['posX', 'posY']]
    tree = KDTree(positions)
    within_10 = tree.query_radius(positions, r=10)
    within_50 = tree.query_radius(positions, r=50)

    queries = {}
    for i, (near, far) in enumerate(zip(within_10, within_50)):
        query = df_centroids.iloc[i]["filename"]
        # the row itself is not its own positive
        positives = np.setdiff1d(near, [i]).tolist()
        negatives = np.setdiff1d(df_centroids.index.values.tolist(), far).tolist()
        random.shuffle(positives)
        random.shuffle(negatives)
        queries[i] = {
            "query": query,
            "positives": positives,
            "negatives": negatives,
        }

    with open(filename, 'wb') as handle:
        print(queries)
        pickle.dump(queries, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Construct Training Baseline Done: " + filename + "!")
def approx_sw2_vr_from_local_bases(pointcloud, bases, max_eps) :
    """Evaluate a cocycle on the triangles of the neighbourhood graph.

    Triangles (i, j, k) with i < j < k whose three pairwise distances are
    all below `max_eps` are visited. For each, the pin lifts of the
    orthogonal transformations between the local bases are combined and the
    triangle is recorded when the resulting value is negative.

    :param pointcloud: indexable collection of points (rows subtractable
        from one another).
    :param bases: one local basis per point; `len(bases[0])` is taken as the
        dimension.
    :param max_eps: strict upper bound on the edge length.
    :return: numpy array with one row `[i, j, k, value]` per recorded
        triangle.
    """
    cocycle = []
    n = len(pointcloud)
    d = len(bases[0])

    kd_tree = KDTree(pointcloud, leaf_size=2)
    within_eps = kd_tree.query_radius(pointcloud, r = max_eps)

    # Keep only j > i with distance strictly below max_eps. This stays a
    # plain list: the per-point arrays have different lengths, which NumPy
    # no longer accepts in np.array(...).
    close_neighbors = [
        np.array([j for j in within_eps[i]
                  if j > i and np.linalg.norm(pointcloud[i] - pointcloud[j]) < max_eps],
                 dtype=int)
        for i in range(n)
    ]

    pin_lifts = {}

    def pin_lift(a, b):
        # Lift of the transformation between bases a and b, computed once.
        key = (a, b)
        if key not in pin_lifts:
            omega = best_orth_trans(bases[a], bases[b])
            pin_lifts[key] = lift_to_pin(d, omega)
        return pin_lifts[key]

    for i in range(n) :
        for j in close_neighbors[i] :
            ij_pin = pin_lift(i, j)

            for k in close_neighbors[j] :
                # (i, j) and (j, k) are edges by construction; check (i, k)
                if not np.linalg.norm(pointcloud[i] - pointcloud[k]) < max_eps :
                    continue

                jk_pin = pin_lift(j, k)
                ik_pin = pin_lift(i, k)
                ki_pin = invert_pin(d, ik_pin)

                simplex_approx_val = mults(d, vects_to_cliff(d, ij_pin + jk_pin + ki_pin))[0]
                simplex_val = simplex_approx_val < 0
                if simplex_val != 0 :
                    cocycle.append([i, j, k, simplex_val])

    return np.array(cocycle)
def overlapped_points(Truth, Predictions, radius):
    """Return the predicted points that have a truth point within `radius`.

    :param Truth: array of truth points, one per row.
    :param Predictions: iterable of 1-D numpy arrays (one per predicted point).
    :param radius: search radius; `None` falls back to 15.
    :return: tuple `(matched, count)` where `matched` is the list of
        predicted points with at least one truth point within the radius
        and `count` is its length.
    """
    # Honour the caller's radius; 15 is kept only as the fallback.
    search_radius = 15 if radius is None else radius

    # leaf_size must be at least 1, also when there are no predictions.
    tree = KDTree(Truth, leaf_size=max(1, 2 * len(Predictions)))

    output = []
    for point in Predictions:
        ind = tree.query_radius(point[np.newaxis, :], r=search_radius)
        if ind[0].shape[0] != 0:
            output.append(point)
    return output, len(output)
def construct_query_dict(df_centroids, filename):
    """Build and pickle the positive/negative query dictionary.

    For row i of `df_centroids`, positives are the other rows within
    distance 10 (northing/easting) and negatives are the index values of
    `df_centroids` not returned by the distance-50 search. Only the
    negatives are shuffled. The dictionary is written to `filename`.
    """
    positions = df_centroids[['northing','easting']]
    tree = KDTree(positions)
    within_10 = tree.query_radius(positions, r=10)
    within_50 = tree.query_radius(positions, r=50)

    queries = {}
    for i, (near, far) in enumerate(zip(within_10, within_50)):
        query = df_centroids.iloc[i]["file"]
        # the row itself is not its own positive
        positives = np.setdiff1d(near, [i]).tolist()
        negatives = np.setdiff1d(df_centroids.index.values.tolist(), far).tolist()
        random.shuffle(negatives)
        queries[i] = {
            "query": query,
            "positives": positives,
            "negatives": negatives,
        }

    with open(filename, 'wb') as handle:
        pickle.dump(queries, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Done ", filename)
def euclidean_analysis_rank_pre(root_path, dataset):
    """Analyse how the next venue ranks by Euclidean distance from the current one.

    For every non-final record returned by `get_records(1)`, the number of
    venues at least as close to the current venue as the next venue is
    counted, binned in steps of 10, and accumulated overall and per
    half-hour time gap (12 bins, gaps of 12 folded into the last one). The
    per-gap distribution is shown as a heat map and the cumulative
    distributions are printed.

    :param root_path: directory prefix of the pickled data file.
    :param dataset: dataset name; the file read is
        `root_path + 'dl_' + dataset + '.pk'`.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(root_path + 'dl_' + dataset + '.pk', 'rb') as dl_file:
        dl = pickle.load(dl_file)

    K = min(5000, dl.nv)
    K_bin = 10
    # Floor division keeps shapes and indices integral on Python 2 and 3.
    n_bins = K // K_bin
    dl.show_info()

    coor_nor = [dl.vid_coor_nor[vid] for vid in range(dl.nv)]
    tree = KDTree(coor_nor)

    time_rank_pre = np.zeros((12, n_bins))
    rank_pre = np.zeros(n_bins)
    time_cnt = np.zeros(12)

    for records_u in dl.uid_records.values():
        # visit counts per venue; maintained here but not used further below
        vid_cnt = {}  # all visable
        records_u.summarize()
        for record in records_u.get_records(0):
            vid_cnt[record.vid] = vid_cnt.get(record.vid, 0) + 1

        records_al_test = records_u.get_records(1)
        for rid, record in enumerate(records_al_test):
            if record.is_last:
                continue
            vid_cnt[record.vid] = vid_cnt.get(record.vid, 0) + 1

            # gap to the following record in half-hour units
            gap_seconds = (records_al_test[rid + 1].dt - records_al_test[rid].dt).total_seconds()
            time_gap = int(gap_seconds / 60 / 30)
            if time_gap == 12:
                record.peek()
                time_gap = 11
            time_cnt[time_gap] += 1

            here = dl.vid_coor_nor[record.vid]
            dist = np.sqrt(np.sum((here - dl.vid_coor_nor[record.vid_next]) ** 2))
            ids = tree.query_radius([here], r=dist)

            # everything beyond the last bin is folded into it
            idx = min(len(ids[0]) // K_bin, n_bins - 1)
            time_rank_pre[time_gap, idx] += 1
            rank_pre[idx] += 1

    # normalise: per time gap, and overall
    for i in range(len(time_cnt)):
        time_rank_pre[i] /= time_cnt[i]
    rank_pre /= np.sum(rank_pre)

    plt.imshow(time_rank_pre, cmap='hot', interpolation='nearest')
    plt.show()

    # cumulative distributions; '%s %s' formatting reproduces the output of
    # the Python 2 statement `print a, b` on both Python 2 and 3
    for i in range(len(time_cnt)):
        time_rank_pre[i] = np.cumsum(time_rank_pre[i])
        print('%s %s' % (i, time_rank_pre[i]))

    print('%s %s' % (0, rank_pre[0]))
    for j in range(1, len(rank_pre)):
        rank_pre[j] += rank_pre[j - 1]
        print('%s %s' % (j * 100, rank_pre[j]))
def spatial_neighbours(coords, n_sp_neighbors=7, radius=None, include_source_location=True, sample_id=None):
    """
    Find spatial neighbours using the number of neighbours or radius (KDTree approach).

    :param coords: numpy.ndarray with x,y positions of spots.
    :param n_sp_neighbors: how many spatially-adjacent neighbors to report for each spot
        (including the source spot). Use 7 for hexagonal grid.
    :param radius: Supersedes `n_sp_neighbors` - radius within which to report
        spatially-adjacent neighbors for each spot. Pick radius based on spot size.
    :param include_source_location: include the observation itself into the list of neighbours.
    :param sample_id: pd.Series or np.array listing sample membership for each
        observation (each row of coords).
    :return: indices into the rows of `coords`. With `radius=None` an integer
        array of shape (n_obs, n_sp_neighbors) or, without the source
        location, (n_obs, n_sp_neighbors - 1). With a radius, an object
        array of length n_obs holding one integer index array per
        observation (the neighbour count varies per observation).
    """
    n_obs = coords.shape[0]
    if sample_id is None:
        sample_id = np.array(["sample" for i in range(n_obs)])
    total_ind = np.arange(0, n_obs).astype(int)

    if radius is None:
        n_cols = n_sp_neighbors if include_source_location else n_sp_neighbors - 1
        coord_ind = np.zeros((n_obs, n_cols), dtype=int)
    else:
        # ragged result: one index array per observation
        coord_ind = np.empty(n_obs, dtype=object)

    # create and query spatial proximity tree within each sample
    for sam in np.unique(sample_id):
        sam_ind = np.isin(sample_id, [sam])
        # row numbers (in coords) of this sample's observations
        global_ind = total_ind[sam_ind]
        sam_coords = coords[sam_ind, :]
        coord_tree = KDTree(sam_coords)

        if radius is None:
            n_list = np.array(coord_tree.query(sam_coords, k=n_sp_neighbors,
                                               return_distance=False))
            # replace sample-specific indices with a global index
            n_list = global_ind[n_list]

            if include_source_location:
                coord_ind[sam_ind, :] = n_list
            else:
                # Compare against the *global* index of each row, since
                # n_list already holds global indices.
                is_self = n_list == global_ind[:, np.newaxis]
                # If a row does not list itself (e.g. duplicated
                # coordinates), drop its farthest neighbour instead so every
                # row loses exactly one entry.
                is_self[~is_self.any(axis=1), -1] = True
                coord_ind[sam_ind, :] = n_list[~is_self].reshape(
                    (len(global_ind), n_sp_neighbors - 1))
        else:
            found = coord_tree.query_radius(sam_coords, radius, count_only=False)
            for row, local_neighbours in zip(global_ind, found):
                neighbours = global_ind[local_neighbours]
                if not include_source_location:
                    neighbours = neighbours[neighbours != row]
                coord_ind[row] = neighbours

    return coord_ind
def impute_work_locations_same_zone(hts_trips, df_ag, df_candidates, df_travel, name):
    """Assign each agent a work location at a sampled crow-fly distance from home.

    A distance is drawn per agent from the weighted histogram of
    `hts_trips["crowfly_distance"]`; the agent receives the farthest
    candidate within that distance of home, or the nearest candidate when
    none lies within it.

    :param hts_trips: table with `crowfly_distance` and `weight` columns.
    :param df_ag: agents with `home_x` / `home_y` columns.
    :param df_candidates: candidate locations with `x`, `y` and
        `location_id` columns.
    :param df_travel: not used by this function.
    :param name: not used by this function.
    :return: copy of `df_ag` with `x`, `y` and `location_id` columns set to
        the chosen candidate.
    """
    hist_cp, bins_cp = np.histogram(hts_trips["crowfly_distance"],
                                    weights=hts_trips["weight"], bins=500)

    home_coordinates_cp = np.column_stack([df_ag["home_x"], df_ag["home_y"]])
    work_coordinates = np.column_stack([df_candidates["x"], df_candidates["y"]])

    # inverse-CDF sampling of one distance per agent
    bin_midpoints = bins_cp[:-1] + np.diff(bins_cp) / 2
    cdf = np.cumsum(hist_cp)
    cdf = cdf / cdf[-1]
    values = np.random.rand(len(df_ag))
    value_bins = np.searchsorted(cdf, values)
    random_from_cdf_cp = bin_midpoints[value_bins]  # in meters

    tree = KDTree(work_coordinates)
    # sort_results requires return_distance=True; the distances themselves
    # are not needed afterwards
    indices_cp, _ = tree.query_radius(home_coordinates_cp, r=random_from_cdf_cp,
                                      return_distance=True, sort_results=True)

    # In some cases no work facility was found within the given radius.
    # In this case select the nearest facility (k=1, so this also works with
    # a single candidate).
    for i, found in enumerate(indices_cp):
        if len(found) == 0:
            nearest = tree.query(home_coordinates_cp[i].reshape(1, -1), k=1,
                                 return_distance=False)
            indices_cp[i] = nearest[0]

    # results are sorted by distance, so the last entry is the farthest
    chosen = [found[-1] for found in indices_cp]
    df_chosen = df_candidates.iloc[chosen]

    df_return = df_ag.copy()
    df_return["x"] = df_chosen["x"].values
    df_return["y"] = df_chosen["y"].values
    df_return["location_id"] = df_chosen["location_id"].values
    return df_return
def test_random_cpu(self):
    """Check ball_query on random CPU tensors against a scikit-learn KDTree."""
    a = torch.randn(100, 3).to(torch.float)
    b = torch.randn(50, 3).to(torch.float)

    # first half of each cloud is batch 0, the remainder batch 1
    half_a = a.shape[0] // 2
    half_b = b.shape[0] // 2
    batch_a = torch.tensor([0] * half_a + [1] * (a.shape[0] - half_a))
    batch_b = torch.tensor([0] * half_b + [1] * (b.shape[0] - half_b))
    R = 1

    def run_query(sort):
        return ball_query(R, 15, a, b, mode="PARTIAL_DENSE",
                          batch_x=batch_a, batch_y=batch_b, sort=sort)

    # two sorted queries must agree
    idx, dist = run_query(True)
    idx1, dist = run_query(True)
    torch.testing.assert_allclose(idx1, idx)

    # two unsorted queries are expected to differ
    with self.assertRaises(AssertionError):
        idx, dist = run_query(False)
        idx1, dist = run_query(False)
        torch.testing.assert_allclose(idx1, idx)

    self.assertEqual(idx.shape[0], b.shape[0])
    self.assertEqual(dist.shape[0], b.shape[0])
    self.assertLessEqual(idx.max().item(), len(batch_a))

    # Comparison to see if we have the same result
    tree = KDTree(a.detach().numpy())
    idx3_sk = tree.query_radius(b.detach().numpy(), r=R)
    i = np.random.randint(len(batch_b))
    for p in idx[i].detach().numpy():
        if 0 <= p < len(batch_a):
            assert p in idx3_sk[i]
class Sampler:
    '''
    Sample points in free space at a certain height
    '''

    def __init__(self, data, safety_distance, zmin = 10, zmax = 20):
        '''
        Parameters:
            data - 2-D array; columns 0/1 are read as centre x/y and columns
                3/4 as half-widths in x/y
            safety_distance - margin passed to extract_polygons and added to
                the polygon search radius
            zmin, zmax - altitude range to sample from
        '''
        self._polygons, self._heights = np.transpose(extract_polygons(data, safety_distance))
        self._xmin = np.min(data[:, 0] - data[:, 3])
        self._xmax = np.max(data[:, 0] + data[:, 3])

        self._ymin = np.min(data[:, 1] - data[:, 4])
        self._ymax = np.max(data[:, 1] + data[:, 4])

        self._zmin = zmin
        # limit z-axis
        self._zmax = zmax

        # Record maximum polygon dimension in the xy plane
        # multiply by 2 since given sizes are half widths
        # This is still rather clunky but will allow us to
        # cut down the number of polygons we compare with by a lot.
        self._max_poly_xy = 2 * np.max((data[:, 3], data[:, 4])) + 2 * safety_distance
        centers = np.array([(p.centroid.x, p.centroid.y) for p in self._polygons]).reshape(-1, 2)
        self._tree = KDTree(centers, metric='euclidean')

    def sample(self, num_samples):
        """Implemented with a k-d tree for efficiency.

        Draws `num_samples` uniform (x, y, z) tuples and returns those that
        do not collide with any nearby polygon.
        """
        xvals = np.random.uniform(self._xmin, self._xmax, num_samples)
        yvals = np.random.uniform(self._ymin, self._ymax, num_samples)
        zvals = np.random.uniform(self._zmin, self._zmax, num_samples)
        samples = list(zip(xvals, yvals, zvals))

        pts = []
        for s in samples:
            query = np.array([s[0], s[1]]).reshape(1, -1)
            idxs = self._tree.query_radius(query, r=self._max_poly_xy)[0]
            point = Point(s)
            # A sample collides when it lies inside a polygon AND at or
            # below that polygon's height; any single hit is enough.
            in_collision = any(
                self._polygons[int(ind)].contains(point)
                and self._heights[int(ind)] >= s[2]
                for ind in idxs
            )
            if not in_collision:
                pts.append(s)

        return pts

    @property
    def polygons(self):
        return self._polygons

    @property
    def heights(self):
        return self._heights
def create_density_plot(X, Y, embedding):
    """Evaluate a normalised density on the grid given by X and Y.

    For every grid node, the rows of `embedding` whose first two columns
    lie within distance 2 of the node are passed to
    `eval_density_at_point`; the resulting grid is divided by its sum.
    """
    Z = np.zeros_like(X)
    tree = KDTree(embedding[:, :2])
    for i, j in np.ndindex(X.shape[0], X.shape[1]):
        gx, gy = X[i, j], Y[i, j]
        neighbour_ids = tree.query_radius([[gx, gy]], r=2)[0]
        nearby_points = embedding[neighbour_ids]
        Z[i, j] = eval_density_at_point(np.array([gx, gy]), nearby_points)
    return Z / Z.sum()
def get_direct_neighbors(site, radius):
    """
    Return the indices of the sites within `radius` of `site`.

    Coordinates are loaded from 'Coords.npy' under `wind_data_path`.

    :param site: row index of the query site in the coordinate array
    :param radius: search radius
    :return: array of row indices (the result of a single radius query)
    """
    coords = np.load(wind_data_path + '/Coords.npy')
    tree = KDTree(coords, leaf_size=1)
    query_point = coords[site, :].reshape(1, -1)
    hits = tree.query_radius(query_point, r=radius,
                             count_only=False, return_distance=False)
    return hits[0]
def eps_neighbor_count(X, X_test, h):
    """Return, for each row of X_test, the number of rows of X within distance h."""
    return KDTree(X).query_radius(X_test, r=h, count_only=True)
def estimator1(x, y, k):
    """
    Estimator 1 of
    Estimating mutual information, A. Kraskov et al., Physical Review E 69, 2004.

    x, y: Arrays of shape (N, Dx) and (N, Dy), where N is the number of samples
        and Dx, Dy are the dimensions of the random variables X and Y.
    k: k for the k-th nearest neighbors

    returns the estimate for the mutual information
    """
    joint = np.concatenate([x, y], axis=-1)
    n_samples = x.shape[0]

    # The chebyshev/maximum metric is used since it is the easiest way to
    # fulfil eq. 6 of the paper in all dimensions. k + 1 neighbours are
    # requested because the query returns the point itself as well.
    joint_dist, _ = KDTree(joint, metric="chebyshev").query(joint, k + 1)
    kth_dist = joint_dist[:, -1]

    counts = np.empty((n_samples, 2))
    for col, marginal in enumerate((x, y)):
        tree = KDTree(marginal, metric="chebyshev")
        # query_radius does not use "strictly less", so distances are
        # fetched and the strict comparison is applied here.
        _, marginal_dist = tree.query_radius(marginal, kth_dist,
                                             return_distance=True,
                                             count_only=False)
        for i in range(n_samples):
            counts[i, col] = (marginal_dist[i] < kth_dist[i]).sum()

    # No "+ 1" is needed: the radius query already counted the point itself,
    # so each count is already one too large.
    digamma_x_mean = digamma(counts[:, 0]).mean()
    digamma_y_mean = digamma(counts[:, 1]).mean()

    return digamma(k) + digamma(n_samples) - digamma_x_mean - digamma_y_mean
class Sampler:
    """Draw random integer points that do not collide with obstacle polygons."""

    def __init__(self, data):
        """Extract the polygons from `data` and index their centres.

        Columns 0/1 of `data` are read as centre x/y and columns 3/4 as
        half-widths in x/y.
        """
        self._polygons = extract_polygons(data)

        x_centre, y_centre = data[:, 0], data[:, 1]
        x_half, y_half = data[:, 3], data[:, 4]
        self._xmin = np.min(x_centre - x_half)
        self._xmax = np.max(x_centre + x_half)
        self._ymin = np.min(y_centre - y_half)
        self._ymax = np.max(y_centre + y_half)

        # limit z-axis
        self._zmin = 20
        self._zmax = 30

        print("Extract Polygons..")
        # widest polygon extent in the xy plane (sizes are half widths)
        self._max_poly_xy = 2 * np.max((x_half, y_half))
        print("Extract Polygons..")
        print(len(self._polygons))

        centers = np.array([p.center for p in tqdm(self._polygons)])
        print("Extract Polygons..")

        self._tree = KDTree(centers, metric='euclidean')
        print("Sampler Initialized..")

    def sample(self, num_samples):
        """Implemented with a k-d tree for efficiency."""
        xvals = np.random.randint(int(self._xmin), int(self._xmax), size=num_samples)
        yvals = np.random.randint(int(self._ymin), int(self._ymax), size=num_samples)
        zvals = np.random.randint(self._zmin, self._zmax, size=num_samples)

        pts = []
        for s in zip(xvals, yvals, zvals):
            centre_xy = np.array([s[0], s[1]]).reshape(1, -1)
            idxs = list(self._tree.query_radius(centre_xy, r=self._max_poly_xy)[0])
            # every nearby polygon is tested (list, not a short-circuiting
            # generator)
            hits = [
                self._polygons[int(ind)].contains(s)
                and self._polygons[int(ind)].height >= s[2]
                for ind in idxs
            ]
            if not any(hits):
                pts.append(s)
        return pts

    @property
    def polygons(self):
        return self._polygons
def matrix_from_vertices_gen_cocycle(pointcloud, coh_gen, deaths, cocycle, co_death):
    """Assemble the edge-by-generator matrix of cochains.

    Rows are the ordered pairs (i, j), i < j, of distinct points at distance
    strictly below `co_death`. Columns are, in order: the generators in
    `coh_gen` (each restricted to edges shorter than
    `min(co_death, death)`), the coboundary of every vertex that has at
    least one neighbour, and finally `cocycle`.

    :param pointcloud: indexable collection of points.
    :param coh_gen: iterable of generators, each a list of `[i, j, v]`.
    :param deaths: one death value per generator in `coh_gen`.
    :param cocycle: list of `[i, j, v]` entries.
    :param co_death: strict upper bound on edge length.
    :return: integer matrix of shape (number of edges, number of generators).
    """
    N = len(pointcloud)

    kd_tree = KDTree(pointcloud, leaf_size=2)
    within = kd_tree.query_radius(pointcloud, r=co_death)

    # Kept as a plain list: the per-point arrays have different lengths,
    # which NumPy no longer accepts in np.array(...).
    close_neighbors = [
        np.array([
            j for j in within[i]
            if j != i and np.linalg.norm(pointcloud[i] - pointcloud[j]) < co_death
        ], dtype=int)
        for i in range(N)
    ]

    # start by having as generators only the coboundaries of vertices
    gens = [
        [[i, j, -1] for j in close_neighbors[i]]
        for i in range(N)
        if len(close_neighbors[i]) > 0
    ]

    # restrict each cohomology generator to the edges alive at min(co_death, d)
    true_coh_gen = [
        [e for e in g
         if np.linalg.norm(pointcloud[e[0]] - pointcloud[e[1]]) < min(co_death, d)]
        for g, d in zip(coh_gen, deaths)
    ]

    gens = list(true_coh_gen) + list(gens) + [cocycle]

    ### rows should be indexed by pairs of ordered and distinct points within distance co_death
    rows = [(i, j) for i in range(N) for j in close_neighbors[i] if i < j]
    edge_to_row = {p: n for n, p in enumerate(rows)}

    M = np.zeros((len(rows), len(gens)), dtype=int)
    for col, g in enumerate(gens):
        for i, j, v in g:
            # entries given on the reversed edge enter with opposite sign
            if i < j:
                M[edge_to_row[(i, j)], col] = v
            else:
                M[edge_to_row[(j, i)], col] = -v

    return M
def compute_neighbours(data, radius, sort=False):
    """Return, for each row of `data`, the indices of the rows within `radius`.

    When `sort` is true each index array is sorted in place.
    """
    neighbourhoods = KDTree(data).query_radius(data, r=radius, return_distance=False)
    if not sort:
        return neighbourhoods
    for members in neighbourhoods:
        members.sort()
    return neighbourhoods
def count(data,distance):
    """Count the other points within `distance` of the last point of `data`.

    The 'x' and 'y' columns of `data` are (re)assigned from
    `getcoord(data)`, so `data` is modified in place.

    :param data: table supporting `data[['x', 'y']]` column selection.
    :param distance: search radius; if an array is given, its last entry is
        used (it is the one that applies to the last point).
    :return: number of points within the radius of the last point, not
        counting the point itself.
    """
    data['x'], data['y'] = getcoord(data)
    points = data[['x', 'y']].values
    tree = KDTree(points)

    radius = np.asarray(distance)
    if radius.ndim > 0:
        radius = radius.ravel()[-1]

    # Only the last point's count is returned, so only that point is queried.
    res = tree.query_radius(points[-1:], r=radius, count_only=True)
    return res[0] - 1
def GetPeriodicDuplicatePoints(inPoints, intNumberOfNeighbours: int, fltRadius: float,inCellVectors):
    """Return the indices of points with too few neighbours within fltRadius.

    The points are wrapped periodically via AddPeriodicWrapper (with width
    4*fltRadius) and a point is reported when fewer than
    intNumberOfNeighbours + 1 wrapped points lie within fltRadius of it.
    Assumes a lattice configuration with fixed number of neighbours.
    Returns an empty list when inPoints is empty.
    """
    intLength = len(inPoints)
    if not intLength > 0:
        return []

    arrWrappedPoints = AddPeriodicWrapper(inPoints, inCellVectors, 4*fltRadius)
    arrCounts = KDTree(arrWrappedPoints).query_radius(inPoints, fltRadius, count_only=True)
    # indices whose neighbour count falls short
    (arrShortIndices,) = np.where(arrCounts < intNumberOfNeighbours+1)
    return arrShortIndices[arrShortIndices < intLength]
def find_neighbors(self, x, X):
    """Find the neighbours of x (points of X within self.eps) using sklearn KDTree.

    Points whose coordinates equal those of x are left out.
    Returns the list of neighbour indices into X.
    """
    tree = KDTree(X, leaf_size=5)
    within_eps = tree.query_radius([x], r=self.eps)[0]
    return [i for i in within_eps if tuple(X[i]) != tuple(x)]
def density(features, objectives, density_radius=1):
    """Count of objectives within a given radius r to points in features.

    Arguments that are not numpy arrays are replaced by their `.locs`
    attribute. The counts are passed through `minmax` before being returned.
    """
    feature_locs = features if isinstance(features, np.ndarray) else features.locs
    objective_locs = objectives if isinstance(objectives, np.ndarray) else objectives.locs

    counts = KDTree(objective_locs).query_radius(
        feature_locs, count_only=True, r=density_radius)
    return minmax(counts)
def fit_predict(self, data):
    """Cluster `data` by density and return one label per row.

    A row with at least `self.min_samples` rows within `self.eps` of it
    (itself included) seeds a cluster, which is grown through every member
    that satisfies the same condition. Rows reached by no cluster keep the
    label -1.

    :param data: 2-D array of points, one per row.
    :return: numpy array of cluster labels (0, 1, ...; -1 for noise) ordered
        by row index.
    """
    cluster_counter = 0
    labels = dict()
    tree = KDTree(data, metric=self.metric, leaf_size=10)

    for index_row, row in enumerate(data):
        if index_row in labels:
            continue

        neighbors = list(tree.query_radius([row], r=self.eps)[0])
        if len(neighbors) < self.min_samples:
            # provisional noise; may be claimed by a later cluster
            labels[index_row] = -1
            continue

        labels[index_row] = cluster_counter
        # set mirror of `neighbors` for cheap membership tests
        queued = set(neighbors)

        # `neighbors` grows while it is being iterated: newly reachable
        # points are appended and visited in turn.
        for neighbor_index in neighbors:
            if neighbor_index in labels:
                # noise becomes a member; other labelled points are left as
                # they are. Neither is expanded.
                if labels[neighbor_index] == -1:
                    labels[neighbor_index] = cluster_counter
                continue

            labels[neighbor_index] = cluster_counter
            add_neighbors = tree.query_radius([data[neighbor_index]], r=self.eps)[0]
            if len(add_neighbors) >= self.min_samples:
                for candidate in add_neighbors:
                    if candidate not in queued:
                        queued.add(candidate)
                        neighbors.append(candidate)

        cluster_counter += 1

    res = np.array([labels[key] for key in sorted(labels)])
    return res
def sequential_addition(X, r, order=None):
    """Yield indices of points of X such that no yielded point lies within r of an earlier one.

    Points are visited in `order`; a point is yielded unless it lies within
    `r` of a point yielded before it.

    :param X: 2-D array of points, one per row.
    :param r: exclusion radius.
    :param order: optional iterable of row indices giving the visiting
        order. When omitted or empty, points are visited by increasing mean
        distance to their nearest neighbours (up to 9 of them).
    """
    tree = KDTree(X, metric='euclidean')

    if order is not None:
        # materialise so arrays and generators can be tested for emptiness
        order = list(order)
    if not order:
        # k is capped so the query also works with fewer than 10 points
        D, _ = tree.query(X, min(10, len(X)))
        # column 0 is the point itself
        order = D[:, 1:].mean(axis=1).argsort()

    visited = np.zeros(len(X), dtype=bool)
    for i in order:
        if not visited[i]:
            yield i
            iis, = tree.query_radius([X[i]], r, return_distance=False)
            visited[iis] = True
def calc_spatial_neighbor(X_spatial, eps, leaf_size):
    '''
    Compute spatial neighbours with a k-d tree.

    The k-d tree is used mainly to cope with the computation on large
    amounts of data. Returns, for each row of X_spatial, the (unsorted)
    indices of the rows within eps.
    '''
    query_options = dict(return_distance=False, count_only=False, sort_results=False)
    return KDTree(X_spatial, leaf_size=leaf_size).query_radius(
        X_spatial, eps, **query_options)
class hierarchical_search:
    """Radius search over a fixed point set, returning the points themselves."""

    def __init__(self, points, leaf_size):
        """Index `points` in a KDTree built with the given `leaf_size`."""
        self.tree = KDTree(points, leaf_size=leaf_size)
        self.points = points

    def query_radius(self, queries, radius):
        """Return, per query, the stored points within `radius` of it."""
        return [
            self.points[members]
            for members in self.tree.query_radius(queries, radius)
        ]
def _mi1(self, x, y):
    """k-NN estimate of the mutual information between x and y.

    x and y are 2-D arrays with one sample per row. When they are equal the
    entropy of x is returned instead. For self.p == 2 the difference
    self._log_cd(d1) - self._log_cd(d2) is added to the estimate.
    """
    assert x.shape[0] == y.shape[0]
    if np.array_equal(x, y):
        return self._entropy(x)

    N, d1 = x.shape
    d2 = y.shape[1]

    # distance to the k:th neighbour of every point in the joint (x, y)
    # space; k + 1 since the 1st neighbour is always the point itself
    xy = np.hstack((x, y))
    joint_dists, _ = scsp.cKDTree(xy).query(xy, k=self.k + 1, p=float("inf"))

    # shrink the radius slightly so that only points strictly closer than
    # the k:th neighbour distance are counted
    MULTIP = 1 - 1e-10
    Kdists = MULTIP * joint_dists[:, self.k]

    treeX = KDTree(x, metric='chebyshev')
    treeY = KDTree(y, metric='chebyshev')
    n_x = treeX.query_radius(x, Kdists, count_only=True)
    n_y = treeY.query_radius(y, Kdists, count_only=True)

    mi = psi(self.k) + psi(N) - (np.mean(psi(n_x)) + np.mean(psi(n_y)))

    if self.p == 2:
        return mi + (self._log_cd(d1) - self._log_cd(d2))
    # p == inf and every other p: no correction term
    return mi
def cpec_density(total_cpec_coords, radius=310.559):
    """
    Count, for every coordinate, the coordinates within `radius` of it.

    :param total_cpec_coords: 2-D array-like of coordinates, one per row
    :param radius: search radius, in the units of the coordinates
    :return: array with one count per row of `total_cpec_coords`; each
        count includes the row itself
    """
    tc = KDTree(total_cpec_coords)
    # Return the counts instead of discarding them.
    return tc.query_radius(total_cpec_coords, r=radius, count_only=True)
def find_density_reachable_points(dataset, maximum_distance):
    """Creates the density-reachable matrix of the dataset

    The return is a dict that maps each point index (zero-based) to a tuple
    of indices for the points in its neighborhood (the points within
    maximum_distance, Euclidean metric).
    """
    neighborhoods = KDTree(dataset, metric="euclidean").query_radius(
        X=dataset, r=maximum_distance)
    return {
        element_index: tuple(neighborhoods[element_index])
        for element_index in range(dataset.shape[0])
    }
def find_tree_neighbors(self, atoms, probe):
    """Return, per atom, the indices of the atoms within its search radius.

    :param atoms: 2-D array; columns 0-2 are used as coordinates and
        column 3 is added to the radius.
    :param probe: scalar added twice to column 3 to form each atom's search
        radius.
    :return: object array with one index array per atom (the atom itself
        included).
    """
    radius = atoms[:, 3] + probe + probe
    points = atoms[:, :3]

    tree = KDTree(points, leaf_size=2)
    # '%s' formatting gives the same output as the Python 2 statement
    # `print 'RADIUS=', radius, '\n'` on both Python 2 and 3.
    print('RADIUS= %s \n' % (radius,))
    all_nn_indices = tree.query_radius(points, r=radius)  # NNs
    return all_nn_indices
def samples_within_sphere(raw_data, radius):
    """Count, for every sample, the samples lying within one or more radii.

    :param raw_data: 2-D array of samples, one per row.
    :param radius: a single radius, or a list of radii.
    :return: float array of shape (num_samples, len(radius)) for a list of
        radii, otherwise (num_samples, 1); each entry is the number of
        samples within the radius (the sample itself included).
    """
    print("Creating Features based on samples within sphere")
    num_samples = raw_data.shape[0]

    # Treat the scalar case as a one-element list so both cases share one
    # code path (a list must never be passed to the tree as `r` directly).
    radii = radius if isinstance(radius, list) else [radius]

    # Initialize feature output matrix
    features = np.zeros((num_samples, len(radii)))

    # Make KDTree for nearest neighbor queries
    tree = KDTree(raw_data)

    # Create a progress bar
    pbar = create_pbar(num_samples)

    for i, r in enumerate(radii):
        # progress is reported once per radius, before its query
        pbar.update((i * num_samples) // len(radii))
        # one batched query per radius covers all samples
        features[:, i] = tree.query_radius(raw_data, r=r, count_only=True)

    pbar.finish()
    return features
def counts_2d_2pt_from_pos(pos_sdss, pos_rm, lam, rmax=220., reso=2.):
    """Bin cluster-galaxy pairs in (r_pi, r_sigma), per lambda bin.

    :param pos_sdss: (n_sdss, ndim) array of galaxy positions.
    :param pos_rm: (n_rm, ndim) array of cluster positions.
    :param lam: passed to get_lambda_bin to assign each cluster a bin index.
    :param rmax: maximum pair separation considered.
    :param reso: bin width of both grids.
    :return: list with one 2-D count array per lambda bin, indexed by
        (r_pi bin, r_sigma bin) and scaled by 1 / (n_rm * n_sdss).
    """
    # preliminaries
    nsdss = pos_sdss.shape[0]
    nrm = pos_rm.shape[0]

    # figure out which lambda bins each RM cluster goes into
    lam_bin = get_lambda_bin(lam)
    n_lam_bin = len(set(lam_bin))

    # KDTree over the SDSS positions
    from sklearn.neighbors import KDTree
    tree_sdss = KDTree(pos_sdss, leaf_size=30)

    # define grids for r_pi and r_sigma.
    rpigrid = np.arange(-rmax, rmax, reso)
    nrpigrid = len(rpigrid)
    rsigmagrid = np.arange(0, rmax, reso)
    nrsigmagrid = len(rsigmagrid)

    print('...querying tree...')
    print('...done querying tree...')

    # One extra row/column because np.digitize returns indices 0..len(grid).
    counts_rpi_rsigma = [np.zeros((nrpigrid+1, nrsigmagrid+1), dtype=float)
                         for i in range(n_lam_bin)]

    # loop over clusters, calculate (r_pi, r_sigma) for all galaxies within
    # rmax of each, and bin those counts
    for irm in range(nrm):
        print('%i/%i' % (irm, nrm))
        this_pos_rm = pos_rm[irm, :]
        # the tree is queried with a single 2-D row
        these_ind, these_s = tree_sdss.query_radius(
            this_pos_rm.reshape(1, -1), rmax,
            count_only=False, return_distance=True)
        these_ind = these_ind[0]
        these_s = these_s[0]
        if len(these_ind) == 0:
            continue

        these_pos_sdss = pos_sdss[these_ind, :]
        these_mu = dot_los2(this_pos_rm, these_pos_sdss)
        these_rpi = these_s*these_mu
        these_rsigma = these_s*np.sqrt((1.-these_mu**2.))
        ind_rpi = np.digitize(these_rpi, rpigrid)
        ind_rsigma = np.digitize(these_rsigma, rsigmagrid)

        # np.add.at accumulates correctly when several pairs share a cell
        np.add.at(counts_rpi_rsigma[lam_bin[irm]], (ind_rpi, ind_rsigma), 1.)

    # normalize
    # ok, really you'd want to normalize by nrm *per lambda bin*,
    # but i don't think it will make any material difference.
    for i in range(n_lam_bin):
        counts_rpi_rsigma[i] *= (1./nrm/nsdss)

    return counts_rpi_rsigma
def current_datapoints_threshold_filter(self, neighbour_points = 5):
    """
    Filter from current datapoints, those that do not have enough neighbour points in the 2*max_dist radius (in meters).
    Assumption: if there is less than neighbour_points around the data point, it can't be a part of event.
    Method doesn't take into account networks.
    This method is computationally cheaper, than self.current_datapoints_outliers_filter, so it is used as a prefilter.
    Method updates self.current_datapoints dict.

    The radius used is self.eps*2, and the count for a point includes the
    point itself, since every point is part of the searched set.

    Args:
        neighbour_points (int): minimal number of neighbours, every point should have.
    """
    nets = list(self.current_datapoints.keys())
    if not nets:
        # nothing to filter (concatenating zero arrays would raise)
        return

    coords = concatenate([self.current_datapoints[x]['array'] for x in nets])
    if len(coords) == 0:
        return

    megatree = KDTree(coords)
    for net in nets:
        points = self.current_datapoints[net]
        if len(points['array']) == 0:
            # an empty network has nothing to query or drop
            continue
        neighbours_number = megatree.query_radius(points['array'], r=self.eps*2, count_only=True)
        # one mask keeps coordinates and ids aligned
        keep = neighbours_number >= neighbour_points
        points['array'] = points['array'][keep]
        points['ids'] = points['ids'][keep]
class BigStarBasis(StarBasis):

    def __init__(self, libname='', verbose=False, log_interp=True,
                 n_neighbors=0, driver=None, in_memory=False,
                 use_params=None, **kwargs):
        """An object which holds the stellar spectral library, performs
        interpolations of that library, and has methods to return attenuated,
        normalized, smoothed stellar spectra.

        This object is set up to work with large grids, so the models file is
        kept open for access from disk.  scikit-learn kd-trees are required
        for model access.  Ideally the grid should be regular (though the
        spacings need not be equal along a given dimension).

        :param libname:
            Path to the hdf5 file to use for the spectral library.  Must have
            "ckc" or "ykc" in the filename (to specify which kind of loader
            to use).

        :param n_neighbors: (default: 0)
            Number of nearest neighbors to use when requested parameters are
            outside the convex hull of the library parameters.  Stored as
            ``self.n_neighbors``; the methods defined in this class do not
            read it.

        :param verbose:
            If True, print information about the parameters used when a
            point is outside the convex hull.  Stored as ``self.verbose``.

        :param log_interp: (default: True)
            Interpolate in log(flux) instead of flux.

        :param driver:
            Passed through to ``h5py.File`` by ``load_lib``.

        :param in_memory: (default: False)
            Switch to determine whether the grid is loaded in memory or read
            from disk each time a model is constructed (like you'd want for
            very large grids).

        :param use_params:
            Sequence of strings.  If given, only use the listed parameters
            (which must be present in the `_libparams` structure) to build
            the grid and construct spectra.  Otherwise all fields of
            `_libparams` will be used.
        """
        self.verbose = verbose
        self.logarithmic = log_interp
        self._libname = libname
        self.n_neighbors = n_neighbors
        self._in_memory = in_memory

        self.load_lib(libname, driver=driver)
        # Do some important bookkeeping
        if use_params is None:
            self.stellar_pars = self._libparams.dtype.names
        else:
            self.stellar_pars = tuple(use_params)
        self.ndim = len(self.stellar_pars)
        self.lib_as_grid()
        self.params = {}

    def load_lib(self, libname='', driver=None):
        """Read a ykc library which has been preconvolved to be close to your
        data resolution.  This library should be stored as an HDF5 file, with
        the datasets ``wavelengths``, ``parameters`` and ``spectra``.  These
        are ndarrays of shape (nwave,), (nmodels,), and (nmodels, nwave)
        respectively.  The ``parameters`` array is a structured array.

        Unless ``self._in_memory`` is set, the h5 file object is left open so
        that spectra can be accessed from disk.
        """
        import h5py
        f = h5py.File(libname, "r", driver=driver)
        self._wave = np.array(f['wavelengths'])
        self._libparams = np.array(f['parameters'])
        if self._in_memory:
            # Everything is copied out, so the file can be closed.
            self._spectra = np.array(f['spectra'])
            f.close()
        else:
            # Keep the dataset handle; the file must stay open.
            self._spectra = f['spectra']

    def get_star_spectrum(self, **kwargs):
        """Given stellar parameters, obtain an interpolated spectrum at those
        parameters.

        :param **kwargs:
            Keyword arguments must include values for the ``stellar_pars``
            parameters that are stored in ``_libparams``.

        :returns wave:
            The wavelengths at which the spectrum is defined.

        :returns spec:
            The spectrum interpolated to the requested parameters.

        :returns unc:
            The uncertainty spectrum, where the uncertainty is due to
            interpolation error.  Currently unimplemented (i.e. it is a None
            type object).
        """
        inds, wghts = self.weights(**kwargs)
        if self.logarithmic:
            spec = np.exp(np.dot(wghts, np.log(self._spectra[inds, :])))
        else:
            spec = np.dot(wghts, self._spectra[inds, :])
        spec_unc = None
        return self._wave, spec, spec_unc

    def weights(self, **params):
        """Return the library indices and normalized weights used to
        interpolate to ``params``.

        Vertices with non-positive weight are dropped and the remaining
        weights are rescaled to sum to one.

        :returns inds:
            Indices into the library of the contributing models.

        :returns wghts:
            The corresponding weights, summing to one.
        """
        inds = self.knearest_inds(**params)
        wghts = self.linear_weights(inds, **params)
        # NOTE: no check is made here that all 2**ndim vertices of an
        # enclosing hypercube were found, nor that the raw weights sum to 1.
        good = wghts > 0
        inds = inds[good]
        wghts = wghts[good]
        wghts /= wghts.sum()
        return inds, wghts

    def lib_as_grid(self):
        """Convert the library parameters to pixel indices in each dimension,
        and build and store a KDTree for the pixel coordinates.
        """
        # Get the unique gridpoints in each param
        self.gridpoints = {}
        for p in self.stellar_pars:
            self.gridpoints[p] = np.unique(self._libparams[p])
        # Digitize the library parameters
        X = np.array([np.digitize(self._libparams[p], bins=self.gridpoints[p],
                                  right=True)
                      for p in self.stellar_pars])
        self.X = X.T
        # Build the KDTree
        self._kdt = KDTree(self.X)  # , metric='euclidean')

    def params_to_grid(self, **targ):
        """Convert a set of parameters to grid pixel coordinates.

        :param targ:
            The target parameter location, as keyword arguments.  The
            elements of ``stellar_pars`` must be present as keywords.

        :returns x:
            The target parameter location in pixel coordinates.

        :raises ValueError:
            If, for any parameter, the target is below the smallest
            gridpoint or has no gridpoint above it.
        """
        # Index of the gridpoint at or just below the target, per parameter.
        inds = np.array([np.digitize([targ[p]], bins=self.gridpoints[p],
                                     right=False) - 1
                         for p in self.stellar_pars])
        # One index per parameter.  reshape (rather than squeeze) keeps a
        # 1-element array for a one-parameter grid, so it stays iterable.
        inds = inds.reshape(-1)

        # A valid cell needs a lower edge (i >= 0) and an upper edge
        # (i + 1 < number of gridpoints).  The lower bound must be checked
        # explicitly: an index of -1 would not raise, it would wrap around
        # to the last gridpoint and give a meaningless coordinate.
        npoints = np.array([len(self.gridpoints[p])
                            for p in self.stellar_pars])
        if np.any(inds < 0) or np.any(inds + 1 >= npoints):
            pstring = "{0}: min={2} max={3} targ={1}\n"
            s = [pstring.format(p, targ[p], *self.gridpoints[p][[0, -1]])
                 for p in self.stellar_pars]
            raise ValueError("At least one parameter outside grid.\n{}".format(' '.join(s)))

        # fractional index.  Could use stored denominator to be slightly
        # faster
        find = [(targ[p] - self.gridpoints[p][i]) /
                (self.gridpoints[p][i+1] - self.gridpoints[p][i])
                for i, p in zip(inds, self.stellar_pars)]
        return inds + np.squeeze(find)

    def knearest_inds(self, **params):
        """Find all parameter ``vertices`` within a sphere of radius
        sqrt(ndim).  The parameter values are converted to pixel coordinates
        before a search of the KDTree.

        :param params:
            Keyword arguments which must include keys corresponding to
            ``stellar_pars``, the parameters of the grid.

        :returns inds:
            The sorted indices of all vertices within sqrt(ndim) of the pixel
            coordinates, corresponding to **params.
        """
        # Convert from physical space to grid index space
        xtarg = self.params_to_grid(**params)
        # Query the tree within radius sqrt(ndim)
        try:
            inds = self._kdt.query_radius(xtarg.reshape(1, -1),
                                          r=np.sqrt(self.ndim))
        except(AttributeError):
            # Tree object without ``query_radius``; fall back to the
            # ``query_ball_point`` spelling of the same search.
            inds = self._kdt.query_ball_point(xtarg.reshape(1, -1),
                                              np.sqrt(self.ndim))
        return np.sort(inds[0])

    def linear_weights(self, knearest, **params):
        """Use ND-linear interpolation over the knearest neighbors.

        :param knearest:
            The indices of the ``vertices`` for which to calculate weights.

        :param params:
            The target parameter location, as keyword arguments.

        :returns wght:
            The weight for each vertex, computed as the volume of the
            hypercube formed by the target parameter and each vertex.
            Vertices more than 1 away from the target in any dimension are
            given a weight of zero.
        """
        xtarg = self.params_to_grid(**params)
        x = self.X[knearest, :]
        dx = xtarg - x
        # Fractional pixel weights
        wght = ((1 - dx) * (dx >= 0) + (1 + dx) * (dx < 0))
        # set weights to zero if model is more than a pixel away
        wght *= (dx > -1) * (dx < 1)
        # compute hyperarea for each model and return
        return wght.prod(axis=-1)

    def triangle_weights(self, knearest, **params):
        """Triangulate the k-nearest models, then use the barycenter of the
        enclosing simplex to interpolate.

        NOTE(review): reads ``self.model_points``, which is not set by any
        method of this class -- presumably provided by ``StarBasis``; confirm.

        :returns inds:
            Sorted vertex indices of the enclosing simplex (indices into the
            ``knearest`` subset handed to the triangulation).

        :returns wghts:
            The barycentric weights, in the same order as ``inds``.
        """
        inparams = np.array([params[p] for p in self.stellar_pars])
        dtri = Delaunay(self.model_points[knearest, :])
        triangle_ind = dtri.find_simplex(inparams)
        inds = dtri.simplices[triangle_ind, :]
        transform = dtri.transform[triangle_ind, :, :]
        Tinv = transform[:self.ndim, :]
        x_r = inparams - transform[self.ndim, :]
        bary = np.dot(Tinv, x_r)
        # Barycentric coordinates sum to one; the last follows from the rest.
        last = 1.0 - bary.sum()
        wghts = np.append(bary, last)
        oo = inds.argsort()
        return inds[oo], wghts[oo]
def get_pairwise_velocities_one_hemi(hemi, kmax=0.1, rmax=50.):
    """Estimate a line-of-sight velocity for every redMaPPer cluster.

    For each cluster, the SDSS galaxies between ``rmin`` (5 Mpc, hard-coded)
    and ``rmax`` are collected with a KD-tree.  Each galaxy contributes
    ``corr_delta_vel(distance) * dot_los(cluster, galaxy)`` and the
    contributions are summed.  ``corr_delta_vel`` is tabulated on a 1 Mpc
    grid for redshifts 0.05..0.7 (step 0.01) and linearly interpolated; each
    cluster uses the tabulated redshift closest to its ``z_spec``.

    Clusters whose galaxy count within ``rmax`` is below the 10th percentile
    of all clusters are skipped and keep ``vlos = 0``.

    Parameters
    ----------
    hemi : object
        Passed to ``grid3d``, ``load_sdss_data_both_catalogs`` and
        ``load_redmapper``.
    kmax : float
        Passed to ``corr_delta_vel``.
    rmax : float
        Outer search radius (Mpc).

    Returns
    -------
    rm
        The redMaPPer catalog with two added entries: ``'vlos'`` and
        ``'weight'`` (all ones).
    """
    from scipy import interpolate
    from sklearn.neighbors import KDTree

    # create 3d grid object
    grid = grid3d(hemi=hemi)
    # load SDSS data
    sdss = load_sdss_data_both_catalogs(hemi)
    # load redmapper catalog
    rm = load_redmapper(hemi=hemi)

    # get XYZ positions (Mpc) of both datasets
    x_sdss, y_sdss, z_sdss = grid.xyz_from_radecz(sdss['ra'], sdss['dec'], sdss['z'], applyzcut=False)
    x_rm, y_rm, z_rm = grid.xyz_from_radecz(rm['ra'], rm['dec'], rm['z_spec'], applyzcut=False)
    pos_sdss = np.vstack([x_sdss, y_sdss, z_sdss]).T
    pos_rm = np.vstack([x_rm, y_rm, z_rm]).T

    # build a KDTree for SDSS LRG's.
    tree_sdss = KDTree(pos_sdss, leaf_size=30)

    # One query gives indices and distances; the per-cluster counts are just
    # the lengths of those results, so no second (count_only) query is needed.
    ind, dist = tree_sdss.query_radius(pos_rm, rmax, count_only=False, return_distance=True)
    lrg_counts = np.array([len(d) for d in dist])
    min_counts = np.percentile(lrg_counts, 10)

    # loop over RM clusters, get vlos
    ncl = len(rm['ra'])
    vlos = np.zeros(ncl)
    rmin = 5.  # Mpc, tmpp, worth exploring

    redshift_grid = np.arange(0.05, 0.7, 0.01)
    rfine = np.arange(rmin-1, rmax+1, 1.)

    # One interpolator per tabulated redshift, stored in the same order as
    # redshift_grid so the argmin below can index it directly.
    corr_delta_vel_interp = [
        interpolate.interp1d(rfine, corr_delta_vel(rfine, z=redshift, kmax=kmax))
        for redshift in redshift_grid
    ]

    print('*********** using kmax=%0.2f, rmax=%i' % (kmax, rmax))
    for i in range(ncl):
        print('%i %i' % (i, ncl))
        if lrg_counts[i] < min_counts:
            continue

        wh_not_too_close = np.where(dist[i] > rmin)[0]
        these_dist = dist[i][wh_not_too_close]
        these_ind = ind[i][wh_not_too_close]

        # get 3d positions
        these_pos_sdss = pos_sdss[these_ind, :]
        this_pos_rm = pos_rm[i, :]

        # dot with line of sight
        these_dot_los = dot_los(this_pos_rm, these_pos_sdss)

        # use the tabulated redshift closest to this cluster
        this_redshift = rm['z_spec'][i]
        closest = np.argmin(np.abs(redshift_grid - this_redshift))
        these_vel = corr_delta_vel_interp[closest](these_dist)

        these_vlos = these_vel*these_dot_los
        vlos[i] = np.sum(these_vlos)  # tmpp, sum or mean?

    rm['vlos'] = vlos
    rm['weight'] = np.ones(ncl)
    return rm
def avgdigamma(points, dvec, metric='minkowski', p=float('inf')):
    """Average digamma of per-point neighbour counts.

    A KD-tree is built on ``points`` with the requested metric.  For each
    point, the number of tree points within radius ``dvec - 1e-15`` of it is
    counted, and the mean of ``digamma(count)`` over all points is returned.

    ``dvec`` is used as the ``query_radius`` radius, so it is either one
    radius per point or a single radius.  The 1e-15 shrink presumably turns
    the inclusive radius test into an effectively strict one -- confirm
    against the caller.
    """
    distance = DistanceMetric.get_metric(metric, p=p)
    neighbour_tree = KDTree(points, metric=distance)
    radii = dvec - 1e-15
    counts = neighbour_tree.query_radius(points, radii, count_only=True)
    n_samples = len(points)
    return np.sum(digamma(counts) / n_samples)
prospective_centers.append([xvc, yvc, zvc]) print('Number of empty cells: ' + repr(nvtot)) print('Proceeding to construct KD-Tree...') print('') tree = KDTree(pos, leaf_size=5) print('KD-Tree successfully constructed') print('') counter = 0 for center in prospective_centers: counter = counter + 1 print('Center ' + repr(counter) + ' of ' + repr(len(prospective_centers))) ind, dist = tree.query_radius(center, r=maxvoid_radius,\ return_distance=True, sort_results=True) countfail = len(dist([0])) doIwrite = False for i in reversed(range(len(dist[0]))): if countfail / (4./3 * numpy.pi * dist[0][i] ** 3) < thresh * rhomed: radiofail = dist[0][i] iifail = len(dist[0][:i]) doIwrite = True countfail = countfail - 1 break if doIwrite: numpy.savetxt(outfile, (center[0], center[1], center[2],\ radiofail, iifail), newline=' ')
def _total_correlation_ksg_sklearn(data, rvs, crvs=None, k=4, noise=1e-10):
    """
    Estimate the (conditional) total correlation of observed data with the
    KSG k-nearest-neighbour estimator, using scikit-learn KD-trees.

    Parameters
    ----------
    data : np.array
        Real valued time series data; rows are observations.
    rvs : iterable of iterables
        The columns for which the total correlation is to be computed.
    crvs : iterable
        The columns upon which the total correlation should be conditioned.
    k : int
        The number of nearest neighbors to use in estimating the local
        kernel density.
    noise : float
        The standard deviation of the normally-distributed noise to add to
        the data.

    Returns
    -------
    tc : float
        The total correlation of `rvs` given `crvs`.

    Notes
    -----
    The result is in bits (the estimate is divided by log(2)), not nats as
    most KSG estimators report.
    """
    # KSG suggest adding noise (to break symmetries?)
    data = _fuzz(data, noise)

    if crvs is None:
        crvs = []

    psi_n = digamma(len(data))
    ln2 = np.log(2)

    # Columns of the joint space, then each variable extended by the
    # conditioning columns.
    joint_cols = list(flatten(rvs)) + crvs
    rvs = [cols + crvs for cols in rvs]
    dims = [len(data[0, cols]) for cols in rvs]

    joint_tree = KDTree(data[:, joint_cols], metric="chebyshev")
    marginal_trees = [KDTree(data[:, cols], metric="chebyshev") for cols in rvs]

    # Distance to the k-th neighbour in the joint space (k + 1 because each
    # point is its own nearest neighbour).
    neighbour_dists = joint_tree.query(data[:, joint_cols], k + 1)[0]
    epsilons = neighbour_dists[:, -1]

    # Number of points within epsilon in each marginal space.
    marginal_counts = []
    for cols, marginal_tree in zip(rvs, marginal_trees):
        marginal_counts.append(
            marginal_tree.query_radius(data[:, cols], epsilons, count_only=True))

    log_epsilons = np.log(epsilons)

    h_rvs = []
    for counts in marginal_counts:
        h_rvs.append(-digamma(counts).mean())

    h_all = -digamma(k)

    if crvs:
        cond_tree = KDTree(data[:, crvs], metric="chebyshev")
        cond_counts = cond_tree.query_radius(data[:, crvs], epsilons, count_only=True)
        h_crvs = -digamma(cond_counts).mean()
    else:
        # Unconditioned case: add the sample-size and volume terms.
        volume_term = (ln2 - log_epsilons).mean()
        h_rvs = [h_rv + psi_n + d * volume_term for h_rv, d in zip(h_rvs, dims)]
        h_all += psi_n + sum(dims) * volume_term
        h_crvs = 0

    marginal_total = 0
    for h_rv in h_rvs:
        marginal_total = marginal_total + (h_rv - h_crvs)
    tc = marginal_total - (h_all - h_crvs)

    return tc / ln2
def __init__(self, pts, k=None, r=None, kmax=None, rmax=None):
    """
    Compute, for every point, a local density ``rho``, the distance
    ``delta`` to the nearest denser point (its ``chief``), and
    ``gamma = sphere_coeff * rho * delta**ndim``.

    Parameters
    ----------
    pts : array, shape(n, d)
        Data points. Should be already normalized if necessary.
    k : int
        Neighbors used to estimate the local density rho. The radius is
        taken as the distance to the last of the ``k`` points returned by
        the tree query.
    r : float
        Radius used to estimate the local density rho; the number of points
        within ``r`` is counted instead. Exactly one of ``k`` and ``r`` must
        be given.
    kmax : int
        If given, only search the nearest kmax neighbors to calculate delta.
        kmax is equivalent to search a sphere of size about kmax**(1/d)
        times the local average separation between points. Default is to
        search all points.
    rmax : float
        If given, only search the neighbors within rmax to calculate delta.
        Default is to search all points.

    Raises
    ------
    ValueError
        If both or neither of ``k``/``r`` are given, or if both ``kmax``
        and ``rmax`` are given.

    Notes
    -----
    A point with no denser point in its search set keeps ``chief = -1`` and
    ``delta = Rmax`` (the diagonal of the bounding box of ``pts``).

    Todos
    -----
    Optimal choice of k and gamma
    Performance optimization with Cython or Numba
    Substructure within density saddle point
    Labeling the noise
    """
    if (k is not None) and (r is not None):
        raise ValueError("Only one of 'k' or 'r' can be specified!")
    if (k is None) and (r is None):
        # Otherwise the density formula below fails with an opaque
        # TypeError on None.
        raise ValueError("One of 'k' or 'r' must be specified!")
    if (kmax is not None) and (rmax is not None):
        raise ValueError("Only one of 'kmax' or 'rmax' can be specified!")

    # np.asfarray no longer exists in NumPy 2; this is the equivalent call.
    pts = np.asarray(pts, dtype=float)
    npts, ndim = pts.shape
    Rmax = np.linalg.norm(pts.max(0) - pts.min(0))
    tree = KDTree(pts)

    # density: count / volume of the d-dimensional ball of radius r
    if r is not None:
        k = tree.query_radius(pts, r, count_only=True)
    else:
        r = tree.query(pts, k)[0][:, -1]
    sphere_coeff = np.pi**(0.5 * ndim) / gamma_func(0.5 * ndim + 1)
    rho = k / (sphere_coeff * r**ndim)
    rho[rho == 0] = rho[rho > 0].min() / 2  # reduce by an arbitrary factor

    # delta: distance to the nearest denser point.  The defaults apply to
    # every point for which no denser point is found, so nothing leaks over
    # from the previously processed point.
    delta = np.full(npts, Rmax, dtype='float')
    chief = np.full(npts, -1, dtype='int')  # superior neighbor

    if kmax is not None or rmax is not None:
        if kmax is not None:
            dists, index = tree.query(
                pts, kmax, return_distance=True, sort_results=True)
        else:
            index, dists = tree.query_radius(
                pts, rmax, return_distance=True, sort_results=True)
        for i in range(npts):
            # Neighbours are sorted by distance, so the first denser one is
            # the nearest denser one.
            denser = np.flatnonzero(rho[index[i]] > rho[i])
            if denser.size:
                first = denser[0]
                chief[i] = index[i][first]
                delta[i] = dists[i][first]
    else:
        dists = squareform(pdist(pts))
        for i in range(npts):
            denser = np.flatnonzero(rho > rho[i])
            if denser.size:
                # argmin returns the first minimum, i.e. the lowest index
                # among equally distant candidates.
                nearest = denser[np.argmin(dists[i, denser])]
                # Only accept a candidate strictly closer than the default.
                if dists[i, nearest] < delta[i]:
                    chief[i] = nearest
                    delta[i] = dists[i, nearest]

    # gamma
    gamma = sphere_coeff * rho * delta**ndim  # need sphere_coeff?
    sorted_index = np.argsort(gamma)
    sorted_gamma = gamma[sorted_index]

    # properties
    self.npts = npts
    self.ndim = ndim
    self.pts = pts
    self.rho = rho
    self.delta = delta
    self.gamma = gamma
    self.chief = chief
    self.sorted_index = sorted_index
    self.sorted_gamma = sorted_gamma
class GlobalSynthCat(object):
    """
    A class for synthetic catalogs with a KDTree attribute to allow
    for super fast queries.
    """

    def __init__(self, cat_fn=None, catalog=None, cat_params=None):
        """
        Load or build the catalog and index its positions.

        The catalog comes from, in order of precedence: the file ``cat_fn``
        (read with ``Table.read``), the given ``catalog`` object, or a new
        catalog from ``build_synthetic_galaxy_catalog(**cat_params)``.

        ``cat_params`` defaults to ``{'nsynths': 100}``. The default is
        created per call rather than shared as a mutable default argument.

        A ``synth_id`` column (1..N) is added or overwritten, and a KDTree
        is built on the output of ``ra_dec_to_xyz`` for the ``ra``/``dec``
        columns.
        """
        from sklearn.neighbors import KDTree

        if cat_fn is not None:
            self.cat = Table.read(cat_fn)
        elif catalog is not None:
            self.cat = catalog
        else:
            if cat_params is None:
                cat_params = {'nsynths': 100}
            self.cat = build_synthetic_galaxy_catalog(**cat_params)
        self.cat['synth_id'] = np.arange(1, len(self.cat) + 1)
        xyz = ra_dec_to_xyz(self.cat['ra'], self.cat['dec'])
        self.kdt = KDTree(np.asarray(xyz).T)

    def query_radius(self, ra, dec, r):
        """
        Search for sources around coordinate within circle of
        radius r in arcseconds.
        """
        xyz = np.array(ra_dec_to_xyz(ra, dec)).T.reshape(1, -1)
        # r / 3600 converts arcseconds to degrees before the conversion to
        # a distance in the tree's coordinates.
        idx = self.kdt.query_radius(
            xyz, angular_dist_to_euclidean_dist(r / 3600.0),
            count_only=False, return_distance=False)[0]
        return self.cat[idx]

    def get_exp_synths(self, exp, search_radius=720):
        """
        Get the synths that fall within the given exposure.

        Sources within ``search_radius`` arcseconds of the exposure centre
        are selected, then only those whose pixel position lies inside the
        exposure bounding box are kept. Their ``x``/``y`` columns hold the
        pixel position relative to ``exp.getXY0()``.
        """
        wcs = exp.getWcs()
        xc, yc = exp.getDimensions()//2 + exp.getXY0()
        coord = wcs.pixelToSky(lsst.afw.geom.Point2D(xc, yc))
        ra_c, dec_c = coord.getRa().asDegrees(), coord.getDec().asDegrees()
        cat = self.query_radius(ra_c, dec_c, search_radius).copy()
        if len(cat) > 0:
            mask = np.zeros(len(cat), dtype=bool)
            # NOTE(review): -1 creates integer columns, so the pixel
            # coordinates stored below may be truncated -- confirm intended.
            cat['x'] = -1
            cat['y'] = -1
            # Loop-invariant exposure geometry.
            bbox = exp.getBBox()
            xy0 = exp.getXY0()
            for i, src in enumerate(cat):
                sky_coord = lsst.afw.geom.SpherePoint(
                    src['ra'] * lsst.afw.geom.degrees,
                    src['dec'] * lsst.afw.geom.degrees)
                xy_coord = wcs.skyToPixel(sky_coord)
                if bbox.contains(lsst.afw.geom.Point2I(xy_coord)):
                    mask[i] = True
                    x0, y0 = xy_coord - xy0
                    cat[i]['x'] = x0
                    cat[i]['y'] = y0
            cat = cat[mask]
        return cat

    def write(self, fn):
        """Write the catalog to ``fn``, overwriting an existing file."""
        self.cat.write(fn, overwrite=True)