def centroid(X: np.ndarray, tree: BallTree) -> tuple:
    '''Find the centroid of the distribution given by X.'''
    # Find an appropriate radius.
    radius = determine_radius(X, tree)
    rho_max = 0
    runs = []
    # Make sure to sample the whole space.
    for init in range(20):
        # Choose a random initialization.
        points = [X[np.random.choice(X.shape[0])]]
        density = len(tree.query_radius(two_d(points[-1]), r=radius)[0])
        # Start MCMC-esque exploration procedure.
        for i in range(100):
            potential = tree.query_radius(two_d(points[-1]), r=radius)
            new_point = X[np.random.choice(potential[0])]
            new_density = len(tree.query_radius(two_d(new_point), r=radius)[0])
            if np.random.random() < (new_density / density):
                points.append(new_point)
                density = new_density
        if rho_max < density:
            rho_max = density
            best_run = init
        runs.append(points)
    return np.array(runs[best_run]), radius, densest(runs[best_run], tree, radius)
def _calc_tree(xx, yy, radius):
    X = np.zeros((len(xx), 2), dtype='float')
    X[:, 0] = xx[:]
    X[:, 1] = yy[:]
    tree = BallTree(X, metric='euclidean')
    ind = tree.query_radius(X, r=radius)
    ind_sw = tree.query_radius(X, r=VARIANCE_RADIUS_SW)
    return ind, ind_sw
def _predict_gam(ds, conf, time, quantiles=None, size=None, return_gam=False,
                 return_counts=False, max_time_diff=200):
    # insert 0s for every timeseries in the ensemble for the reference
    # period at -35 BP (1985)
    climate = conf.climate + '_ensemble'
    age = conf.age + '_ensemble'
    x = ds[age].values.ravel()
    y = ds[climate].values.ravel()
    mask = (~np.isnan(x)) & (~np.isnan(y))
    if not mask.any():
        return
    else:
        x = x[mask]
        y = y[mask]
    gam = pygam.LinearGAM(pygam.s(0)).gridsearch(
        x[:, np.newaxis], y, progress=False)
    time = np.asarray(time)
    ret = (gam.predict(time), )
    if quantiles is not None:
        ret = ret + (gam.prediction_intervals(time, quantiles=quantiles), )
    if size is not None:
        ret = ret + (gam.sample(
            x[:, np.newaxis], y, sample_at_X=time, n_draws=size).T, )
    if return_counts:
        tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
        counts = tree.query_radius(time[:, np.newaxis], return_counts,
                                   count_only=True).astype(float)
        ret = ret + (counts, )
    # look how many samples in the ensemble fall into the `max_time_diff`
    # time interval around the predicted time
    tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
    counts = tree.query_radius(time[:, np.newaxis], max_time_diff,
                               count_only=True)
    idx = counts < 100
    if idx.any():
        for arr in ret:
            arr[idx] = np.nan
    if return_gam:
        return ret + (gam, )
    else:
        return ret
def get_nearest(infected_coordinates, uninfected_coordinates, d):
    """
    This method returns the indices and distances of the uninfected users
    that are within a distance "d" (parameter) of the infected users.

    Input:
    ------
    @infected_coordinates: array
        Latitude and longitude of GPS coordinates of infected users.
    @uninfected_coordinates: array
        Latitude and longitude of GPS coordinates of uninfected users.
    @d : int
        distance parameter

    Output:
    -------
    @indices : array
        indices of the uninfected users that are within a distance "d"
        of the infected users.
    @distances : array
        distance from uninfected users to infected users.
    """
    # Create tree from the GPS coordinates of uninfected users
    tree = BallTree(uninfected_coordinates, leaf_size=15, metric='haversine')
    indices, distances = tree.query_radius(infected_coordinates, r=d,
                                           return_distance=True)
    indices = indices.transpose()
    distances = distances.transpose()
    return indices, distances
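# A minimal usage sketch of get_nearest above (the coordinates and the unit
# handling are assumptions, not part of the original snippet): BallTree's
# haversine metric works in radians, so latitudes/longitudes are converted
# with np.radians and the search radius is expressed as metres / Earth radius.
import numpy as np

EARTH_RADIUS_M = 6371000.0
infected = np.radians([[51.5074, -0.1278], [48.8566, 2.3522]])      # lat, lon
uninfected = np.radians([[51.5080, -0.1290], [40.7128, -74.0060]])  # lat, lon

indices, distances = get_nearest(infected, uninfected, d=100.0 / EARTH_RADIUS_M)
print(indices)                       # indices of uninfected users within ~100 m
print(distances * EARTH_RADIUS_M)    # haversine distances converted back to metres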
def getMosquitoActivity(lat, long):
    bites_df = getBitesDF()
    bites_df = bites_df.drop('image_Base64', 1)
    bites_df = bites_df.drop('image_name', 1)
    day_1_ = pd.to_datetime(int(time.time()), unit='s')
    day_0_ = day_1_ - timedelta(days=7)
    mask = ((bites_df.index < day_1_) & (bites_df.index > day_0_))
    bites_df = bites_df.loc[mask]
    rad_bites_df = pd.DataFrame()
    rad_bites_df['timestamp'] = bites_df.index
    rad_bites_df['lat_rad'] = toRad_vec(bites_df['latitude'])
    rad_bites_df['long_rad'] = toRad_vec(bites_df['longitude'])
    rad_bites_df = rad_bites_df.set_index('timestamp', drop=True)
    bt = BallTree(rad_bites_df.as_matrix(), metric='haversine')
    indices, distances = bt.query_radius(
        [latLongtoRad(float(lat)), latLongtoRad(float(long))],
        r=RADIUS_DEFAULT, return_distance=True)
    print(indices)
    print(distances)
    nn_list = indices[0].tolist()
    return bites_df.iloc[nn_list, :].to_dict(orient='records')
def find_hits_for_targets(
    *,
    targets: List[Tuple[float, ...]],
    predictions: List[Tuple[float, ...]],
    radius: float,
) -> List[Tuple[int, ...]]:
    """
    Generates a list of the predicted points that are within a radius r of
    the targets. The indices are returned in sorted order, from closest to
    farthest point.

    Parameters
    ----------
    targets
        A list of target points
    predictions
        A list of predicted points
    radius
        The maximum distance that two points can be apart for them to be
        considered a hit

    Returns
    -------
    A list which has the same length as the targets list. Each element
    within this list contains another list that contains the indices of
    the predictions that are considered hits.
    """
    predictions_tree = BallTree(array(predictions))
    hits, _ = predictions_tree.query_radius(X=targets, r=radius,
                                            return_distance=True,
                                            sort_results=True)
    return hits
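# A short hypothetical example of find_hits_for_targets above; the points
# below are made up purely to illustrate the return structure.
targets = [(0.0, 0.0), (10.0, 10.0)]
predictions = [(0.1, 0.0), (0.2, 0.3), (9.0, 9.0)]
hits = find_hits_for_targets(targets=targets, predictions=predictions, radius=1.5)
# hits[0] -> indices of predictions within 1.5 of (0, 0), closest first: [0, 1]
# hits[1] -> indices within 1.5 of (10, 10); (9, 9) is ~1.41 away, so: [2]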
def get_score_for_ideal_points(points, ideal_points, IDEAL_HEIGHT):
    model, scene, after_tps = nrr.non_rigid_registration(points, ideal_points)
    print("Model: ", model)
    print("Scene: ", scene)
    print("after_tps: ", after_tps)
    distances_array = []
    ballTree = BallTree(after_tps)
    i = 0
    for point in ideal_points:
        ind = ballTree.query_radius(point, IDEAL_HEIGHT)
        if len(ind[0]) == 1:
            distances_array.append(np.linalg.norm(point - after_tps[ind[0][0]]))
        else:
            i += 1
            distances_array.append(1000)
    print("SCORE: ", np.mean(distances_array))
    return np.mean(distances_array)
class BallTreeANN:
    def __init__(self):
        """
        Constructor
        """
        self.nbrs = None

    def build_index(self, dataset, leaf_size):
        self.nbrs = BallTree(dataset, leaf_size=leaf_size, metric="euclidean")
        return self.nbrs

    def build_store_index(self, dataset, path, leaf_size):
        self.build_index(dataset, leaf_size)
        self.store_index(path)

    def store_index(self, path):
        with open(path, "wb") as output1:
            pickle.dump(self.nbrs, output1, pickle.HIGHEST_PROTOCOL)

    def load_index(self, path):
        with open(path, "rb") as input1:
            self.nbrs = pickle.load(input1)

    def search_in_radious(self, vector, radious=2):
        # query_radius with return_distance=True returns (indices, distances)
        indices, distances = self.nbrs.query_radius(vector, r=radious,
                                                    return_distance=True)
        return distances, indices

    def search_neighbors(self, vector, num_neighbors):
        distances, indices = self.nbrs.query(vector, k=num_neighbors)
        return distances, indices
def query_neighbors(coords, r2, distance_metric='haversine', weighted=False):
    """Build a network from a set of points and a threshold distance.

    Parameters
    ----------
    coords : array-like (N, 2)
    r2 : float
        Threshold distance.
    distance_metric : str
        Either 'haversine' or None.

    Returns
    -------
    nodes : list of ints
        Correspond to the list of nodes
    edges : list of tuples
        An edge between two nodes exists if they are closer than r2.
    singleton nodes : list of ints
        Nodes that have no connections, e.g. have been visited once.
    """
    # If the metric is haversine update points (to radians) and r2 accordingly.
    if distance_metric == 'haversine':
        coords = np.radians(coords)
        r2 = r2 / 6371000

    # Init tree
    tree = BallTree(coords, metric=distance_metric)

    # Query
    return tree.query_radius(coords, r=r2, return_distance=weighted)
def tract_smooth(optional_flags, tractography, var, file_output):
    from sklearn.neighbors import BallTree

    var = float(var)
    std = var**2

    points = tractography.original_tracts()

    all_points = numpy.vstack(points)
    bt = BallTree(all_points)
    N = len(all_points) / 3
    I = numpy.eye(3)[None, ...]
    for i, tract in enumerate(tractography.original_tracts()):
        # all_points = numpy.vstack(points[:i] + points[i + 1:])
        # bt = BallTree(all_points)

        diff = numpy.diff(tract, axis=0)
        diff = numpy.vstack((diff, diff[-1]))
        lengths = numpy.sqrt((diff**2).sum(1))
        # cum_lengths = numpy.cumsum(lengths)

        diff_norm = diff / lengths[:, None]
        tangent_lines = diff_norm[:, None, :] * diff_norm[:, :, None]
        normal_planes = I - tangent_lines
        # weight_matrices = normal_planes + 1e10 * tangent_lines

        N = max(len(d) for d in bt.query_radius(tract, var * 3))

        close_point_distances, close_point_indices = bt.query(tract, N)

        close_points = all_points[close_point_indices]
        difference_vectors = close_points - tract[:, None, :]
        projected_vectors = (normal_planes[:, None, :] *
                             difference_vectors[..., None]).sum(-2)
        projected_points = projected_vectors + tract[:, None, :]
        # projected_distances2 = (projected_vectors**2).sum(-1)
        # projected_weights = numpy.exp(- .5 * projected_distances2 / std)
        # projected_weights /= projected_weights.sum(-1)[:, None]

        weights = numpy.exp(-.5 * close_point_distances**2 / std)[..., None]
        weights /= weights.sum(-2)[..., None]

        # tract += (weights * projected_vectors).sum(-2)

        # weighted_distances = (
        #     weight_matrices[:, None, :] *
        #     difference_vectors[..., None]
        # ).sum(-2)
        # weighted_distances *= difference_vectors
        # weighted_distances = weighted_distances.sum(-1) ** .5
        # weighted_points = (projected_points * weights).sum(1)

        weighted_points = (projected_points * weights).sum(1)

        tract[:] = weighted_points
        # tract /= norm_term

    return Tractography(tractography.original_tracts(),
                        tractography.original_tracts_data(),
                        **tractography.extra_args)
def get_nearest_neighbours(
    df: pd.DataFrame,
    target_id: int,
    dmax: Optional[int] = None,
    extent: Optional[int] = None,
) -> tuple:
    """
    Args:
        df: halo DataFrame
        target_id: object id for which to find NNs
        dmax: maximal distance between objects

    Return:
        indices and distances
    """
    pos = df[["theta1_deg", "theta2_deg"]].values
    pos_i = df[df["id"] == target_id][["theta1_deg", "theta2_deg"]].values
    if dmax is None:
        dmax = df[df["id"] == target_id]["r200_deg"].values
        if extent is not None:
            dmax *= extent
    if len(pos_i.shape) == 1:
        pos_i = pos_i[np.newaxis, :]
    btree = BallTree(pos)
    pairs = btree.query_radius(pos_i, dmax, return_distance=True)
    return pairs[0][0], pairs[1][0]
def faithful_downsampling(data: np.array, h: float):
    """
    An implementation of faithful downsampling as described in:
    Zare H, Shooshtari P, Gupta A, Brinkman R. Data reduction for spectral
    clustering to analyze high throughput flow cytometry data.
    BMC Bioinformatics 2010;11:403

    Parameters
    -----------
    data: Numpy.array
        numpy array to be down-sampled
    h: float
        radius for nearest neighbours search

    Returns
    --------
    Numpy.array
        Down-sampled array
    """
    communities = None
    registered = np.zeros(data.shape[0])
    tree = BallTree(data)
    while not all([x == 1 for x in registered]):
        i_ = np.random.choice(np.where(registered == 0)[0])
        registered[i_] = 1
        registering_idx = tree.query_radius(data[i_].reshape(1, -1), r=h)[0]
        registering_idx = [t for t in registering_idx if t != i_]
        registered[registering_idx] = 1
        if communities is None:
            communities = data[registering_idx]
        else:
            communities = np.unique(np.concatenate(
                (communities, data[registering_idx]), 0), axis=0)
    return communities
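# A brief hypothetical run of faithful_downsampling above on random data;
# the array shape and radius are illustrative only.
import numpy as np

data = np.random.rand(1000, 4)
reduced = faithful_downsampling(data, h=0.25)
print(reduced.shape)   # typically fewer rows than the input, same number of columns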
def getVoxel(seedPoint, rad, cloud):
    kdt = BallTree(cloud, leaf_size=5, metric='euclidean')
    # print('Extracting with rad %f' % rad)
    ind = kdt.query_radius(seedPoint.reshape(1, -1), r=rad)
    point_ids = np.expand_dims(ind, axis=0)[0, 0].reshape(1, -1)
    print(point_ids.shape)
    # print(scene_cloud[point_ids[0,:],:].shape)
    return cloud[point_ids[0, :], :]
def is_repeated_stop(stop_coords, all_coords, distance):
    if len(all_coords) == 0:
        return False
    tree = BallTree(radians(all_coords), leaf_size=2, metric='haversine')
    result = tree.query_radius(radians([stop_coords]),
                               r=calculate_radius(distance),
                               count_only=True)[0]
    if result == 0:
        return False
    else:
        return True
def densest_radius(X: np.ndarray, support_idx: np.ndarray, tree: BallTree,
                   d_centroids: float) -> int:
    '''Identify the support vector with the densest radius.'''
    return np.argmax([
        len(tree.query_radius(np.atleast_2d(X[i]), r=(0.1 * d_centroids))[0])
        for i in support_idx
    ])
def ratio(X: np.ndarray, vector_idx: int, centroid: np.ndarray,
          d_centroids: float, tree: BallTree) -> float:
    '''Compute the ratio between the density at the support vector and a centroid.'''
    density_vector = len(
        tree.query_radius(np.atleast_2d(X[vector_idx]), r=(0.1 * d_centroids))[0])
    density_centroid = len(
        tree.query_radius(np.atleast_2d(centroid), r=(0.1 * d_centroids))[0])
    if density_centroid == 0:
        density_centroid = 1
    print(f'Density of vector location: {density_vector}')
    print(f'Density of centroid: {density_centroid}')
    return density_vector / density_centroid
def count_amenity(src_points, candidates, rad):
    """Find amenity being searched within the stated radius

    amenity: school, train station, police centre
    """
    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric='haversine')

    # Get distance and index of nearest amenity
    dist, nearest_ind = tree.query(src_points, k=1)
    dist = dist * 6371000

    # Count number of amenity within radius
    count = tree.query_radius(src_points, r=rad, count_only=True)

    # Get indexes of all the amenity within radius
    all_ind = tree.query_radius(src_points, r=rad)

    return count, dist.ravel(), nearest_ind, all_ind
class StopBallTree:
    """
    This class is a stop friendly implementation of a sklearn ball-tree
    """

    def __init__(self, stops):
        """
        :param stops: list of Stop objects
        """
        self.tree = BallTree(self.stop_2_tup(stops), metric='haversine')
        self.tree_stops = list(stops)
        self.R = 3959.87433 * 5280

    def stop_2_tup(self, stops):
        """
        This function takes a list of stops and returns a set of tuples with
        positions in radians

        :param stops: list of stop.Stop objects
        :return: np.ndarray with [lat lon]
        """
        _ = np.asarray([(s.lat, s.lon) for s in stops])
        return np.radians(_)

    def query(self, stops):
        """
        This function takes the query results from sklearn and reformats them

        :param stops: list of stop.Stop objects to query
        :return: tuple(distances, matches)
        """
        dist, matches = self.tree.query(self.stop_2_tup(stops))
        dist = [self.R * x[0] for x in dist]
        matches = [x[0] for x in matches]
        return dist, matches

    def query_radius(self, stops, radius, earth=False):
        """
        Interface for stops with the sklearn query_radius function

        :param stops:
        :return:
        """
        if earth:
            ind = self.tree.query_radius(self.stop_2_tup(stops), r=radius / self.R)
        else:
            # need to convert radius into radians distance
            ind = self.tree.query_radius(self.stop_2_tup(stops), r=radius)
        return {stops[i]: [self.tree_stops[j] for j in ind[i]]
                for i in range(len(stops))}
def radiusUpdate(encode_record, params):
    encode_record -= np.mean(encode_record, 0)
    print(np.max(np.max(encode_record)), np.min(np.min(encode_record)))
    tree = BallTree(encode_record)
    neighbor = tree.query_radius(encode_record, 3, count_only=True) + 1
    print(np.max(neighbor), np.min(neighbor))
    weights = np.power(neighbor, params.alpha) * params.beta
    return tf.constant(weights, dtype=tf.float32)
def queryNN(X_train, X_test, radius, leaf_size):
    """
    Method that identifies from a dataset the NN most similar cases
    (Nearest neighbors).

    X_train: dataset to find neighbours in
    X_test: dataset to find neighbors for
    leaf_size: leaf size of the ball tree
    radius: radius in high dimensional space to search for NNs

    Returns:
    counts: count of NNs for each datapoint
    indices: indices of NNs from dataset X_train
    """
    tree = BallTree(X_train, leaf_size=leaf_size)
    counts = tree.query_radius(X_test, r=radius, count_only=True)
    indices = tree.query_radius(X_test, r=radius)
    return counts, indices
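# A brief usage sketch of queryNN above under assumed data shapes; the
# arrays, radius and leaf size here are illustrative only.
import numpy as np

X_train = np.random.rand(1000, 8)
X_test = np.random.rand(5, 8)
counts, indices = queryNN(X_train, X_test, radius=0.5, leaf_size=40)
# counts[i]  -> number of training points within 0.5 of X_test[i]
# indices[i] -> array of the corresponding row indices in X_train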
def spatial_expression_internal(adata_subset, x_coordinate, y_coordinate,
                                method, radius, knn, imageid, use_raw,
                                subset, label):
    # Create a DataFrame with the necessary information
    data = pd.DataFrame({
        'x': adata_subset.obs[x_coordinate],
        'y': adata_subset.obs[y_coordinate]
    })

    # Identify neighbourhoods based on the method used
    # a) KNN method
    if method == 'knn':
        print("Identifying the " + str(knn) + " nearest neighbours for every cell")
        tree = BallTree(data, leaf_size=2)
        dist, ind = tree.query(data, k=knn, return_distance=True)

    # b) Local radius method
    if method == 'radius':
        print("Identifying neighbours within " + str(radius) + " pixels of every cell")
        kdt = BallTree(data, metric='euclidean')
        ind, dist = kdt.query_radius(data, r=radius, return_distance=True)

    # Normalize range (0-1) and account for total number of cells
    d = scipy.sparse.lil_matrix((len(data), len(data)))
    for row, (columns, values) in enumerate(zip(ind, dist)):
        # Drop self-distance element.
        idx = columns != row
        columns = columns[idx]
        values = values[idx]
        if len(values) == 1:
            values = [1.0]
        elif len(values) > 1:
            # Normalize distances.
            values = (values.max() - values) / (values.max() - values.min())
            values /= values.sum()
        # Assign row to matrix.
        d[row, columns] = values

    # convert to csr sparse matrix
    wn_matrix_sparse = d.tocsr()

    # Calculation of spatial lag
    if use_raw == True:
        spatial_lag = pd.DataFrame(wn_matrix_sparse * np.log1p(adata_subset.raw.X),
                                   columns=adata_subset.var.index,
                                   index=adata_subset.obs.index)
    else:
        spatial_lag = pd.DataFrame(wn_matrix_sparse * adata_subset.X,
                                   columns=adata_subset.var.index,
                                   index=adata_subset.obs.index)

    # return value
    return spatial_lag
def triangles_from_keypoints(keypoints, lower=TRIANGLE_LOWER, upper=TRIANGLE_UPPER):
    """Get Triangles from keypoints.

    >>> from .keypoints import compute_keypoints
    >>> filename = 'fullEndToEndDemo/inputImages/cat_original.png'
    >>> img = cv2.imread(filename)
    >>> keypoints = compute_keypoints(img)
    >>> res = triangles_from_keypoints(keypoints)
    >>> len(res)
    11590
    >>> print(list(map(lambda x: x.tolist(), res[0])))
    [[162.0, 203.0], [261.0, 76.0], [131.0, 63.0]]
    >>> res2 = triangles_from_keypoints(keypoints, lower=10)
    >>> len(res2)
    14238
    >>> res3 = triangles_from_keypoints(keypoints, upper=100)
    >>> len(res3)
    315
    """
    keypoints = np.asarray(keypoints, dtype=float)

    tree = BallTree(keypoints, leaf_size=10)
    i_lower = tree.query_radius(keypoints, r=lower)
    i_upper = tree.query_radius(keypoints, r=upper)
    in_range = [set(u) - set(l) for l, u in zip(i_lower, i_upper)]

    seen = set()
    result = []
    for i, center in enumerate(keypoints):
        seen.add(i)
        in_range_of_center = in_range[i] - seen
        if not in_range_of_center:
            continue

        processed = set()
        for j in in_range_of_center:
            if j < i + 1:
                continue

            points_idx = in_range[j] & in_range_of_center - processed
            if not points_idx:
                continue

            keypoint = keypoints[j]
            points = keypoints[list(points_idx)]
            area = np.absolute(np.cross(points - center, points - keypoint)) / 2

            result += [(center, keypoint, p) for p in points[area > 1300]]
            processed.add(j)

    return result
def __calculate_lonely_points(grid, point_cloud, distance):
    # Generate BallTree for point cloud
    ball_tree = BallTree(point_cloud.get_xy(), metric='manhattan')

    # Calculate for each of the points in the grid, the amount of neighbors
    # in the original ground cloud
    count = ball_tree.query_radius(grid, distance - EPSILON, count_only=True)

    # Return only the points in the grid that don't have a neighbor
    return grid[count == 0]
def spatial_coactive_sets(population, spkdict, time_bins, trajectory,
                          return_tree=False):
    """
    Estimates spatially co-active activity ensembles from the given spike
    dictionary.
    """
    import sklearn
    from sklearn.neighbors import BallTree

    x, y, d, t = trajectory
    pch_x = interpolate.pchip(t, x)
    pch_y = interpolate.pchip(t, y)
    spatial_bins = np.column_stack(
        [pch_x(time_bins[:-1]), pch_y(time_bins[:-1])])

    acv_dict = {
        gid: np.histogram(np.asarray(lst), bins=time_bins)[0]
        for (gid, lst) in viewitems(spkdict[population]) if len(lst) > 1
    }
    n_features = len(time_bins) - 1
    n_samples = len(acv_dict)

    active_gid = {}
    active_bins = np.zeros((n_samples, n_features), dtype=bool)
    for i, (gid, acv) in enumerate(viewitems(acv_dict)):
        active_bins[i, :] = acv > 0
        active_gid[i] = gid

    tree = BallTree(active_bins, metric='jaccard')
    qbins = np.zeros((n_features, n_features), dtype=bool)
    for ibin in range(n_features):
        qbins[ibin, ibin] = True
    nnrs, nndists = tree.query_radius(qbins, r=1, return_distance=True)

    fnnrs = []
    fnndists = []
    for i, (nns, nndist) in enumerate(zip(nnrs, nndists)):
        inds = [
            inn for inn, nn in enumerate(nns)
            if np.any(np.logical_and(active_bins[nn, :], active_bins[i, :]))
        ]
        fnns = np.asarray([nns[inn] for inn in inds])
        fdist = np.asarray([nndist[inn] for inn in inds])
        fnnrs.append(fnns)
        fnndists.append(fdist)

    if return_tree:
        return n_samples, spatial_bins, fnnrs, fnndists, (tree, active_gid)
    else:
        return n_samples, spatial_bins, fnnrs, fnndists
def eval(self, X):
    """Evaluate the kernel density estimation

    Parameters
    ----------
    X : array_like
        array of points at which to evaluate the KDE.  Shape is
        (n_points, n_dim), where n_dim matches the dimension of
        the training points.

    Returns
    -------
    dens : ndarray
        array of shape (n_points,) giving the density at each point.
        The density will be normalized for metric='gaussian' or
        metric='tophat', and will be unnormalized otherwise.
    """
    X = np.atleast_2d(X)
    if X.ndim != 2:
        raise ValueError('X must be two-dimensional')

    if X.shape[1] != self.X_.shape[1]:
        raise ValueError('dimensions of X do not match training dimension')

    if self.metric == 'gaussian':
        # wrangle gaussian into scikit-learn's 'rbf' kernel
        gamma = 0.5 / self.h / self.h
        D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
        D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
        dens = D.sum(1)

    elif self.metric == 'tophat':
        # use Ball Tree to efficiently count neighbors
        bt = BallTree(self.X_)
        counts = bt.query_radius(X, self.h, count_only=True)
        dens = counts / n_volume(self.h, X.shape[1])

    elif self.metric == 'exponential':
        D = pairwise_distances(X, self.X_)
        dens = np.exp(-abs(D) / self.h)
        dens = dens.sum(1)
        dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

    elif self.metric == 'quadratic':
        D = pairwise_distances(X, self.X_)
        dens = (1 - (D / self.h) ** 2)
        dens[D > self.h] = 0
        dens = dens.sum(1)
        dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

    else:
        D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
        dens = D.sum(1)

    return dens
def _binned_mean(ds, conf, time, quantiles=None, size=None,
                 return_counts=False, max_time_diff=200):
    climate = conf.climate + '_ensemble'
    age = conf.age + '_ensemble'
    agedim = ds[conf.age].dims[0]
    ens = conf.ensemble

    def bootstrap_mean(da):
        resampler = np.random.randint(0, len(da), (len(da), size))
        return xr.DataArray(da.values[resampler].mean(axis=0), dims=(ens, ))

    ds = ds.stack(**{age + ens: (agedim, ens)})
    time = pd.Index(time)

    tree = BallTree(time[:, np.newaxis])
    ind, dists = tree.query_radius(
        ds[age].values[:, np.newaxis], max_time_diff,
        return_distance=True, sort_results=True)
    miss = len(time)
    ind = np.array([t[0] if t.size else miss for t in ind])

    grouper = ds[age + ens].copy(data=np.r_[time, [np.nan]][ind])
    grouped = ds[climate].groupby(grouper)
    mask = grouped.count() > 100

    ret = (grouped.mean().where(mask), )
    if quantiles is not None:
        ret = ret + (grouped.quantile(quantiles).where(mask), )
    if size is not None:
        ret = ret + (grouped.apply(bootstrap_mean).where(mask), )
    if return_counts:
        tree = BallTree(ds[age].values.ravel()[:, np.newaxis])
        counts = tree.query_radius(time[:, np.newaxis], return_counts,
                                   count_only=True)
        ret = ret + (xr.DataArray(counts, dims=ret[0].dims[0],
                                  coords={ret[0].dims[0]: time}), )
    return tuple(arr.reindex({age + ens: time}).values for arr in ret)
def OrderCell(data, radius):
    tree = BallTree(data, leaf_size=2)
    Countnumber = []
    for point in range(len(data)):
        # counting the number of neighbors for each point
        count = tree.query_radius(
            data[point].reshape(1, -1), r=radius, count_only=True)
        # storing number of neighbors
        Countnumber.append(count)
    CountnumberDf = pd.DataFrame(Countnumber, columns=['neighbors'])
    return CountnumberDf
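# A small hypothetical call of OrderCell above: count neighbours within a
# radius of 1.5 for a toy 2-D point set (the points are made up).
import numpy as np

data = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
df = OrderCell(data, radius=1.5)
# df['neighbors'] -> [3, 3, 3, 1]; each point counts itself as a neighbour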
def spatial_pscore_internal(adata_subset, proximity, x_coordinate, y_coordinate,
                            phenotype, method, radius, knn,
                            imageid, subset, label):
    # Create a DataFrame with the necessary information
    data = pd.DataFrame({'x': adata_subset.obs[x_coordinate],
                         'y': adata_subset.obs[y_coordinate],
                         'phenotype': adata_subset.obs[phenotype]})

    # Identify neighbourhoods based on the method used
    # a) KNN method
    if method == 'knn':
        print("Identifying the " + str(knn) + " nearest neighbours for every cell")
        tree = BallTree(data[['x', 'y']], leaf_size=2)
        ind = tree.query(data[['x', 'y']], k=knn, return_distance=False)
        neighbours = pd.DataFrame(ind.tolist(), index=data.index)  # neighbour DF
        neighbours_ind = neighbours.copy()  # neighbour DF
        # neighbours.drop(0, axis=1, inplace=True)  # Remove self neighbour

    # b) Local radius method
    if method == 'radius':
        print("Identifying neighbours within " + str(radius) + " pixels of every cell")
        kdt = BallTree(data[['x', 'y']], metric='euclidean')
        ind = kdt.query_radius(data[['x', 'y']], r=radius, return_distance=False)
        # for i in range(0, len(ind)): ind[i] = np.delete(ind[i], np.argwhere(ind[i] == i))  # remove self
        neighbours = pd.DataFrame(ind.tolist(), index=data.index)  # neighbour DF
        neighbours_ind = neighbours.copy()  # neighbour DF

    # Map phenotype
    phenomap = dict(zip(list(range(len(ind))), data['phenotype']))  # Used for mapping
    phenomap_ind = dict(zip(list(range(len(ind))), data.index))  # Used for mapping cell_nme

    # Loop through (all functionized methods were very slow)
    for i in neighbours.columns:
        neighbours[i] = neighbours[i].dropna().map(phenomap, na_action='ignore')
    # do the same index and cell name
    for i in neighbours_ind.columns:
        neighbours_ind[i] = neighbours_ind[i].dropna().map(phenomap_ind, na_action='ignore')

    # Identify all the neighbourhoods that contain the user defined proximity phenotypes
    for i in proximity:
        print(str('Finding neighbourhoods with ') + str(i))
        nn = neighbours[neighbours.isin([i])].dropna(how='all').index
        neighbours = neighbours.loc[nn]

    # Identify all the cells that were part of the neighbourhood in this analysis
    neighbours_ind = neighbours_ind.loc[neighbours.index]
    neighbours_ind_unique = pd.unique(neighbours_ind.values.ravel())

    # subset the neighbourhood cells to include only the cells in the user defined list
    cleaned_neighbours_ind_unique = [x for x in neighbours_ind_unique if str(x) != 'nan']
    d = data.loc[cleaned_neighbours_ind_unique]
    d = d[d['phenotype'].isin(proximity)].index

    # return neighbours for score and image_neighbours for plotting on image
    return {'neighbours': neighbours.index, 'image_neighbours': d}
def get_mask(gaia_file):
    print(gaia_file)
    r_mask = gaia_file_2_radius[os.path.basename(gaia_file)] / 3600.
    hdu_G = fits.open(gaia_file)
    ra_gaia, dec_gaia = hdu_G[1].data['ra'], hdu_G[1].data['dec']
    gaia_coordinates = deg_to_rad * np.transpose([dec_gaia, ra_gaia])
    print('measures distances')
    Tree_obj_Gaia = BallTree(gaia_coordinates, metric='haversine')
    test_c = Tree_obj_Gaia.query_radius(agn_coordinates, r=r_mask, count_only=True)
    to_be_masked = (test_c > 0)
    print('N to mask:', len(to_be_masked.nonzero()[0]))
    return to_be_masked
class BallTree():
    def __init__(self, walkers):
        self.walkers = walkers
        self.tree = BT(walkers.getWalkersLocation())

    def getEdges(self, radius):
        results = []
        for i, neighbors in enumerate(
                self.tree.query_radius(self.walkers.getWalkersLocation(), radius)):
            if len(neighbors) > 0:
                for n in neighbors:
                    results.append((self.walkers[i], self.walkers[n]))
        return results
def minPointsEstimate(enc, eps, imgpath):
    tree = BallTree(np.array(enc))
    allNgbr = []
    allNgbr.append(tree.query_radius(enc, eps, count_only=True))
    _, bins, _ = plt.hist(allNgbr, bins=45)
    plt.grid(axis='y', alpha=0.75)
    plt.xticks(bins, rotation=90)
    plt.title("MinPts Estimate " + encoding)
    plt.ylabel('Number of sessions')
    plt.xlabel('Number of neighbors')
    plt.tight_layout()
    plt.savefig(os.path.join(imgpath, "minptsEstimate.png"))
    plt.close()
def lat_errors(T1, T2, thres=2):
    tree = BallTree(T1.values)
    inds, dists = tree.query_radius(T2.values, r=thres, sort_results=True,
                                    return_distance=True)
    closest_l = []
    for i, ind in enumerate(inds):
        if len(ind) >= 2:
            closest = pd.DataFrame({'X1': [T1.iloc[ind[0]].X],
                                    'Y1': [T1.iloc[ind[0]].Y],
                                    'X2': [T1.iloc[ind[1]].X],
                                    'Y2': [T1.iloc[ind[1]].Y]}, index=[i])
            closest_l.append(closest)
    closest_df = pd.concat(closest_l)
    f = T2.join(closest_df)
    lat_errors = abs((f.X2 - f.X1) * (f.Y1 - f.Y) - (f.X1 - f.X) * (f.Y2 - f.Y1)) \
        / np.sqrt((f.X2 - f.X1)**2 + (f.Y2 - f.Y1)**2)
    return lat_errors
def estimate_bayes_factor(traces, logp, r=0.05, return_list=False):
    """From astroml, estimates the bayes factor using the local density of points"""
    D, N = traces.shape

    # compute volume of a D-dimensional sphere of radius r
    Vr = np.pi ** (0.5 * D) / scipy.special.gamma(0.5 * D + 1) * (r ** D)

    # use neighbor count within r as a density estimator
    bt = BallTree(traces.T)
    count = bt.query_radius(traces.T, r=r, count_only=True)

    BF = logp + np.log(N) + np.log(Vr) - np.log(count)

    if return_list:
        return BF
    else:
        p25, p50, p75 = np.percentile(BF, [25, 50, 75])
        return p50, 0.7413 * (p75 - p25)
def avgdigamma(data, dvec, leaf_size=16):
    """Convenience function for finding expectation value of <psi(nx)> given
    some number of neighbors in some radius in a marginal space.

    Parameters
    ----------
    data : numpy.ndarray
    dvec : array_like (n_points,)

    Returns
    -------
    avgdigamma : float
        expectation value of <psi(nx)>
    """
    tree = BallTree(data, leaf_size=leaf_size, p=float('inf'))
    n_points = tree.query_radius(data, dvec - EPS, count_only=True)
    return digamma(n_points).mean()
def mean_shift(X, bandwidth, seeds, kernel_update_function, max_iterations=10):
    n_points, n_features = X.shape
    stop_thresh = 1e-3 * bandwidth  # when mean has converged
    cluster_centers = []
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    # For each seed, climb gradient until convergence or max_iterations
    for weighted_mean in seeds:
        completed_iterations = 0
        while True:
            points_within = X[ball_tree.query_radius([weighted_mean], bandwidth * 3)[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = kernel_update_function(old_mean, points_within, bandwidth)
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                cluster_centers.append(weighted_mean)
                break
            completed_iterations += 1

    return cluster_centers
def mean_shift_clustering(points, bandwidth, max_iterations=500):
    stop_thresh = 1e-3 * bandwidth
    cluster_centers = []
    points_labels = []
    ball_tree = BallTree(points)

    for weighted_mean in points:
        iter = 0
        while True:
            points_within = points[ball_tree.query_radius([weighted_mean], bandwidth * 3)[0]]
            old_mean = weighted_mean
            weighted_mean = mean_shift(old_mean, points_within, bandwidth)
            converged = euclid_dist(weighted_mean, old_mean) < stop_thresh
            if converged or iter == max_iterations:
                cluster_centers, points_labels = assign_cluster(
                    weighted_mean, cluster_centers, points_labels)
                break
            iter += 1

    return np.asarray(cluster_centers), np.asarray(points_labels)
xyz = np.zeros((5903, 3))
xyz[:, 0] = x[:, 0]
xyz[:, 1] = y[:, 0]
xyz[:, 2] = z[:, 0]

Xtrain = import_train["Xtrain"]
scaler = preprocessing.StandardScaler().fit(Xtrain)
Xtrain = scaler.transform(Xtrain)

from sklearn.neighbors import kneighbors_graph, BallTree
from sklearn.feature_extraction.image import grid_to_graph

xyz_balltree = BallTree(xyz)
print(xyz_balltree)
print(xyz_balltree.query_radius(xyz[0], r=0.04))

# connectivity = kneighbors_graph(xyz_balltree, 2, include_self=True, mode='connectivity')
# connectivity = grid_to_graph(n_x=x, n_y=y, n_z=z)
# agglo = cluster.FeatureAgglomeration(n_clusters=590)
# agglo.fit(Xtrain)
# Xtrain_reduced = agglo.transform(Xtrain)

"""
#k_fold = cross_validation.KFold(len(X_train), 5)
Y_kf = Ytrain.ravel()
k_fold = StratifiedKFold(Y_kf, n_folds=10)
def two_point(data, bins, BT_D=None, BT_R=None, method='standard',
              data_R=None, random_state=None, return_trees=False,
              verbose=False, RR=None, return_RR=False, return_DD=False):
    # Edited by CW to allow user to supply more things and have more things
    # returned.
    """
    Two-point correlation function in Euclidean space.

    Options to return a number of things.  What gets returned is up to the
    user but the order will always be correlation_function, data_balltree,
    random_balltree, random_random, data_data.  If the user asks for a
    subset of those, the list will be shorter but the order will be
    maintained.

    Parameters
    ----------
    data : array_like
        Input data, shape = [n_samples, n_features]
    bins : array_like
        Bins within which to compute the 2-point correlation.
        Shape = Nbins + 1
    BT_D : BallTree (optional)
        Ball tree created with the data positions
    BT_R : BallTree (optional)
        Ball tree created with the random positions
    method : string (optional)
        "standard" or "landy-szalay".  Default is 'standard'.
    data_R : array_like (optional if no BT_R)
        If specified, use this as the random comparison sample.  This must
        be included if you wish to use a pre-computed random ball tree
    random_state : integer, np.random.RandomState, or None (optional)
        Specify the random state to use for generating background.  Not
        used if the randoms are provided by the user.  Default is None
    RR : 1D array-like, shape = Nbins
        If this exact set of randoms and theta bins has been run, you can
        supply the RR counts and not calculate them again.  You also need
        the data if you're running with method='landy-szalay'
    return_trees : boolean (optional)
        If True, the returns will include the data and random ball trees.
        Default is False.
    return_RR : boolean (optional)
        If you know you'll be running a CF with this exact same random
        sample and binning (like with a bootstrap), you can get the RR
        counts returned and feed them back in the next time
    return_DD : boolean (optional)
        In case you want to fit to the pair counts rather than the
        w(theta) estimator, you can get this back too
    verbose : boolean (optional)
        Determines whether or not the function narrates what it's doing.
        Default is False.

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    data_tree : BallTree (optional)
        the ball tree used to calculate distances between objects quickly
        in the data.  only returned if return_trees == True
    random_tree : BallTree (optional)
        the ball tree used to calculate distances between objects quickly
        in the randomly generated set.  only returned if
        return_trees == True
    RR : ndarray (optional)
        the RR counts may be returned (if return_RR==True) and used again
        without recomputing if the theta bins and the random sample is
        exactly the same
    DD : ndarray (optional)
        the DD pair counts, returned if return_DD==True
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle all but one axis to get background distribution
    if data_R is None:
        print("two_point says: generating random sample")
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if BT_D is None:
        if verbose:
            print("two_point says: computing BallTree for data")
        BT_D = BallTree(data)
    if BT_R is None:
        if verbose:
            print("two_point says: computing BallTree for random sample")
        BT_R = BallTree(data_R)

    counts_DD = np.zeros(Nbins + 1)
    counts_RR = np.zeros(Nbins + 1)

    if verbose:
        print("two_point says: working through the CF calc.  This could take a while")
    for i in range(Nbins + 1):
        counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i], count_only=True))
        if RR is None:
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i], count_only=True))
    if verbose:
        print("two_point says: binning done!")

    DD = np.diff(counts_DD)
    if RR is None:
        RR = np.diff(counts_RR)

    # check for zero in the denominator
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor**2 * DD / RR - 1
    elif method == 'landy-szalay':
        counts_DR = np.zeros(Nbins + 1)
        for i in range(Nbins + 1):
            counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i], count_only=True))
        DR = np.diff(counts_DR)
        corr = (factor**2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    to_return = corr
    if return_trees:
        to_return = [to_return]
        to_return.append(BT_D)
        to_return.append(BT_R)
    if return_RR:
        if not return_trees:
            to_return = [to_return]
        to_return.append(RR)
    if return_DD:
        if (not return_trees) and (not return_RR):
            to_return = [to_return]
        to_return.append(DD)

    return to_return
class Learner(object):
    """
    A class that instantiates the feature space for an individual AI,
    chooses moves, and performs learning
    """

    def __init__(self, data_points=None, ai_history=None, threshold=THRESHOLD):
        self.state_list = []
        self.weights_list = []
        if data_points is None:
            data_points = []
        if ai_history is None:
            ai_history = []
        for state, weights in data_points:
            assert(len(state) == 32)
            self.state_list.append(state)
            self.weights_list.append(weights)
        self._threshold = threshold
        self._ai_history = cp.deepcopy(ai_history)
        # self._featureTransform()
        self.X = np.array(self.state_list)
        assert(self.X.shape == (len(data_points), 32) or len(data_points) == 0)
        # Think about different distance metrics. Manhattan or minkowski? P < 1?
        if len(data_points) > 0:
            self._tree = BallTree(self.X, metric='manhattan')
        else:
            self._tree = None

    def getNextMove(self, current_board):
        # current_board.printBoard()
        nn_move = self._getNearestNeighbors(current_board)
        if nn_move is not None:
            next_move = nn_move
        else:
            next_move = self._getMinimax(current_board)
        self._ai_history.append(next_move)
        return next_move

    def updateWeights(self, game_history, status):
        if status == WIN:
            factor = WIN_FACTOR
        elif status == LOSE:
            factor = LOSE_FACTOR
        elif status == TIE:
            factor = 1
        # old_board = Board()
        for _board, _move in game_history:
            assert(any(_move == mv[1] for mv in _board.getMoveList(_move.color)))
            if _move.color == AI_COLOR:
                state = _board.getArray().tolist()
                if state in self.state_list:
                    i = self.state_list.index(state)
                    # j = self.state_list[i].find(move)
                    # print zip(*_board.getMoveList(AI_COLOR))[1]
                    # print list(zip(*_board.getMoveList(AI_COLOR))[1])
                    j = list(zip(*_board.getMoveList(AI_COLOR))[1]).index(_move)
                    self.weights_list[i][j] *= factor
                else:
                    self.state_list.append(state)
                    self.weights_list.append([1] * len(_board.getMoveList(AI_COLOR)))
                    # print zip(*_board.getMoveList(AI_COLOR))[1]
                    j = list(zip(*_board.getMoveList(AI_COLOR))[1]).index(_move)
                    self.weights_list[-1][j] *= factor
            elif _move.color == PLAYER_COLOR:
                _move = _move.getInverse()
                state = _board.getInverse().getArray().tolist()
                if state in self.state_list:
                    i = self.state_list.index(state)
                    # j = self.state_list[i].find(move)
                    j = list(zip(*_board.getInverse().getMoveList(AI_COLOR))[1]).index(_move)
                    self.weights_list[i][j] *= (1.0 / factor)
                else:
                    self.state_list.append(state)
                    self.weights_list.append([1] * len(_board.getInverse().getMoveList(AI_COLOR)))
                    j = list(zip(*_board.getInverse().getMoveList(AI_COLOR))[1]).index(_move)
                    self.weights_list[-1][j] *= (1.0 / factor)
        self.X = np.array(self.state_list)
        self._tree = BallTree(self.X, metric='manhattan')

    def getAiHistory(self):
        return cp.deepcopy(self._ai_history)

    def _getMinimax(self, current_board):
        # return random.choice([bd[1] for bd in current_board.getMoveList(AI_COLOR)])
        (bestBoard, bestVal) = minMax2(current_board, 6)
        # print("bestVal", bestVal)
        # bestBoard[0].printBoard()
        return bestBoard[1]

    def _getNearestNeighbors(self, current_board):
        # dist, ind = self._tree.query(current_board.getArray(), k=3)
        if self._tree is None:
            return None
        ind = self._tree.query_radius(current_board.getArray(), r=self._threshold).tolist()
        ind = ind[0].tolist()
        if len(ind) > 0:
            pass
            # print "neighbors found"
        # cur_moves = current_board.getMoveList(AI_COLOR)
        moves = []
        weights = []
        # print ind
        for i in ind:
            _board = Board(new_array=self.state_list[i])
            assert(len(_board.getMoveList(AI_COLOR)) == len(self.weights_list[i]))
            for j, (board, move) in enumerate(_board.getMoveList(AI_COLOR)):
                # move.printMove()
                # current_board.printBoard()
                if current_board.verifyMove(AI_COLOR, move=move):
                    # print "move found"
                    # move.printMove()
                    if move not in moves:
                        moves.append(move)
                        weights.append(self.weights_list[i][j])
                    else:
                        weights[moves.index(move)] *= self.weights_list[i][j]
        if len(moves) == 0:
            # raise Exception()
            # print "aborted neighbors"
            return None
        else:
            assert(len(moves) == len(weights))
            zipped = zip(moves, weights)
            moves = [mv[0] for mv in zipped if mv[1] >= 1]
            weights = [mv[1] for mv in zipped if mv[1] >= 1]
            if len(moves) < 1:
                return None
            return np.random.choice(moves, 1, weights)[0]
        # neighbor_moves = [move for move in neighbor_moves if move in cur_moves]

    def _featureTransform(self):
        # replace weights with a Gaussian at some point
        # or come up with a better feature transform
        weights = [1, 2, 3, 4, 4, 3, 2, 1]
        transformed_list = []
        for state in self.state_list:
            assert(len(state) == 32)
            new_state = []
            for i in range(32):
                new_state.append(state[i] * weights[i / 4])
            transformed_list.append(new_state)
        self.X = np.array(transformed_list)
class SupportGrid:
    """Grid structure to support the computation of viewpoints.

    Grid structure to support the computation of viewpoints that will be
    used to detect the rho-boundary of a particle system which particle's
    positions are stored in the 'points' array.

    Attributes:
        points: A numpy array containing the position of the particles.
        rho: The value of rho, in general the h value from SPH simulations
            is a good approximation.
        dimension: The dimension of the particle system and the grid.
        cell_size: The length of the cells edges.
        aabb_min: The lower corner of the Axis Aligned Bounding Box
            containing the points.
        aabb_max: The upper corner of the Axis Aligned Bounding Box
            containing the points.
        grid_dims: The number of cells along each axis needed to compute
            the viewpoints, it includes some padding cells on each side.
        grid_min: The lower corner of the grid.
        grid_max: The upper corner of the grid.
        grid_count: A numpy array used to keep the number of points per cell.
        grid_elems: A numpy array containing lists of the indexes of the
            points inside each cell.
        tree: A KDTree structure used to simplify and speedup neighborhood
            queries.
        neighbor_cell_list: A numpy array with indexes in {-1, 0, 1} used to
            assist the traversal of neighboring cells in any dimension >= 1.
    """

    def __init__(self, points, rho, dimension):
        """Constructor

        Initializes the grid and helper structures using the provided points
        and rho parameter.

        Args:
            points: A numpy array containing the coordinates of the particles.
            rho: Needed to compute the rho-boundary of the system.
            dimension: The dimension of the particle system.
        """
        self.points = points
        self.rho = rho
        self.dimension = dimension

        self.cell_size = 2.0 * rho
        self.aabb_min = np.amin(points, axis=0)
        self.aabb_max = np.amax(points, axis=0)

        self.grid_dims = (self.aabb_max - self.aabb_min) / self.cell_size
        # Regarding the + 3: 1 for left side, 1 for right side, 1 for rounding up
        self.grid_dims = np.trunc(self.grid_dims) + 3
        self.grid_dims = self.grid_dims.astype(int)
        self.grid_min = self.aabb_min - self.cell_size
        self.grid_max = self.grid_min + self.grid_dims * self.cell_size

        self.grid_count = np.zeros(self.grid_dims, dtype=int)
        self.grid_elems = np.empty(self.grid_dims, dtype=object)
        self.update_grid()

        self.tree = NeighborsTree(
            self.points, leaf_size=10, metric='euclidean')

        self.neighbor_cell_list = self.compute_neighbor_cell_list()

    def update_grid(self):
        """Updates the grid with the counting and indexes.

        Updates the grid with the number of particles in each cell and puts
        the index of each particle in the corresponding cell.
        """
        for i in range(self.points.shape[0]):
            pt = self.points[i]
            idx = (pt - self.grid_min) / self.cell_size
            idx = to_index_tuple(idx)
            self.grid_count[idx] += 1
            if (self.grid_elems[idx] == None):
                self.grid_elems[idx] = []
            self.grid_elems[idx].append(i)

    def compute_neighbor_cell_list(self):
        """Computes a list of offsets to the neighboring cells.

        Computes a list of offsets to the neighboring cells based on the
        dimension. This is used to simplify the traversal of neighbor cells
        in any dimension. For a 2D grid it produces:
        [[-1 -1], [-1 0], [-1 1], [0 -1], [0 0], [0 1], [1 -1], [1 0], [1 1]].
        By using this list we can visit all the 9 cells around a point or
        cell with a single loop.

        Returns:
            A numpy array containing a list of offsets to neighboring cells.
        """
        previous = np.array([[-1], [0], [1]], dtype=int)
        current = None
        current_n_rows = 3
        for c in range(1, self.dimension):
            ones = np.ones((current_n_rows, 1))
            for i in range(-1, 2):
                temp = np.hstack((ones * i, previous))
                if (current is None):
                    current = temp
                else:
                    current = np.vstack((current, temp))
            current_n_rows *= 3
            previous = current
            current = None
        return previous

    def get_viewpoints(self):
        """Computes and returns the viewpoints that will be used by the
        instances of the HPR operator.

        Computes and returns the viewpoints that will be used by the
        instances of the HPR operator. Empty cells neighboring non-empty
        cells get a viewpoint in its center; non-empty cells that have no
        empty neighbor go through an additional step to generate viewpoints
        in cavity cells.

        Returns:
            A numpy array containing the viewpoints.
        """
        self.viewpoints = []
        # for i in range(self.grid_dims[0]):
        #     for j in range(self.grid_dims[1]):
        #         for k in range(self.grid_dims[2]):
        for cell in range(self.grid_dims.prod()):
            idx = np.unravel_index(cell, self.grid_dims)
            if (self.grid_count[idx] == 0):
                self.process_empty_cell(idx)
            else:
                self.process_nonempty_cell(idx)
        return self.viewpoints

    def process_empty_cell(self, idx):
        """Processes an empty cell and produces a viewpoint on its center.

        Processes an empty cell and produces a viewpoint on its center. The
        viewpoint is created only if the empty cell has a non-empty neighbor
        cell.

        Args:
            idx: The index of the cell.
        """
        for i in range(self.neighbor_cell_list.shape[0]):
            n_idx = idx + self.neighbor_cell_list[i]
            # check grid limits
            if (np.any(np.less(n_idx, np.zeros([1, self.dimension]))) or
                    np.any(np.greater_equal(n_idx, self.grid_dims))):
                continue
            n_idx = to_index_tuple(n_idx)
            # If there is a nonempty neighbor, we place a viewpoint
            # at the center of the current cell
            if (self.grid_count[n_idx] != 0):
                viewpoint = self.grid_min + \
                    np.array(idx) * self.cell_size + 0.5 * self.cell_size
                self.viewpoints.append(viewpoint)
                return

    def process_nonempty_cell(self, idx):
        """Processes a non-empty cell and produces viewpoints if possible.

        Processes a non-empty cell and produces a set of viewpoints based on
        the points inside the cell and its distribution.

        Args:
            idx: The index of the cell.
        """
        # Check if there is an empty neighbor,
        # in this case the empty neighbor should be enough
        for i in range(self.neighbor_cell_list.shape[0]):
            n_idx = idx + self.neighbor_cell_list[i]
            # check grid limits
            if (np.any(np.less(n_idx, np.zeros([1, self.dimension]))) or
                    np.any(np.greater_equal(n_idx, self.grid_dims))):
                continue
            n_idx = to_index_tuple(n_idx)
            if (self.grid_count[n_idx] == 0):
                return

        # Get everyone in the cell, and define a new viewpoint candidate,
        # based on its neighborhood centroid
        for i in range(self.grid_count[idx]):
            ii = self.grid_elems[idx][i]
            pt = self.points[ii]
            neighbors = self.tree.query_radius(pt.reshape(1, -1),
                                               r=2.0 * self.rho)[0]
            centroid = np.sum(
                self.points[neighbors], axis=0) / neighbors.shape[0]
            V = pt - centroid
            V = V / np.linalg.norm(V)
            viewpoint = pt + V * self.rho
            neighbors = self.tree.query_radius(viewpoint.reshape(1, -1),
                                               r=0.95 * self.rho)[0]
            if (neighbors.size == 0):
                self.viewpoints.append(viewpoint)

    def get_candidates(self, viewpoint):
        """Gets a set of points that are candidates to be marked as boundary.

        Gets a set of points that are candidates to be marked as boundary.
        These candidates are inside the local neighborhood of a viewpoint and
        will be used on the HPR operator.

        Args:
            viewpoint: The viewpoint that will be used by the HPR operator.

        Returns:
            A numpy array containing the boundary candidates around the
            viewpoint.
        """
        return self.tree.query_radius(viewpoint.reshape(1, -1),
                                      r=4.0 * self.rho)[0]
def optics(X, eps=float('inf'), min_samples=1, metric='euclidean',
           extraction='hierarchical', ext_kwargs={}):
    """
    Perform OPTICS clustering from vector array or distance matrix.

    Parameters
    ----------
    X : array [n_samples, n_samples] or [n_samples, n_features]
        Array of distances between samples, or a feature array.
        The array is treated as a feature array unless the metric is
        given as 'precomputed'.
    eps : float, optional
        The generating distance between two samples for them to be
        considered as in the same neighborhood.
    min_samples : int, optional
        The number of samples in a neighborhood for a point to be
        considered as a core point.
    metric : string or callable, optional
        The metric to use when calculating distance between instances in
        a feature array. If metric is a string or callable, it must be
        one of the options allowed by metrics.pairwise.calculate_distance
        for its metric parameter. If metric is "precomputed", X is assumed
        to be a distance matrix and must be square.
    extraction : string, optional
        The extraction method used to generate clusters from the ordering
        of points returned by the OPTICS algorithm.
    ext_kwargs : dict
        Keyword arguments to be supplied to the extraction function.

    Returns
    -------
    core_distances : array [n_samples]
        Core distance for each sample.
    ordering : array [n_samples]
        Indices of the samples in the order generated by OPTICS.
    reachability_distances : array [n_samples]
        Reachability distance for each sample.
    labels : array [n_samples]
        Cluster labels for each point. Noisy samples are given the label -1.

    Notes
    -----
    See examples/cluster/plot_optics.py for an example.

    References
    ----------
    Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, and Jörg Sander.
    "OPTICS: ordering points to identify the clustering structure."
    ACM SIGMOD Record 28, no. 2 (1999): 49-60.
    """
    X = atleast2d_or_csr(X)
    n = X.shape[0]

    if min_samples > n:
        raise ValueError('min_samples must be lower than the total number of samples')

    ordering = []
    core_distances = np.ndarray(len(X))

    # Initiate reachability distances to infinity
    reachability_distances = float('inf') * np.ones(n)
    # Set reachability for first point
    reachability_distances[0] = 0

    # Construct spatial indexing structure
    if metric != 'precomputed':
        # TODO: Construct BallTree with the correct metric once the
        # metrics branch has been merged into master
        tree = BallTree(X, metric=metric)

    seeds = np.ones(n, dtype=bool)
    i = 0

    while True:
        # Mark current point as processed
        seeds[i] = False
        # Add current point to the ordering
        ordering.append(i)

        if not any(seeds):
            break

        # Calculate core distance
        if metric == 'precomputed':
            D = X[i]
            core_dist = np.sort(D)[min_samples]
        else:
            core_dist = tree.query(X[i], min_samples + 1)[0][0][-1]
        core_distances[i] = core_dist

        if core_dist <= eps:
            # Get the neighbors of the current point
            if metric == 'precomputed':
                neighbors = D[seeds] <= eps
                ds = D[neighbors]
            else:
                ind, dist = tree.query_radius(X[i], eps, True)
                si = seeds[ind[0]]
                neighbors = ind[0][si]
                ds = dist[0][si]

            cds = core_dist * np.ones(len(ds))

            # Set the new reachability distances to
            # max(core_distance, distance)
            new_reach_dists = np.maximum(cds, ds)
            reachability_distances[neighbors] = new_reach_dists

            i = np.nonzero(seeds)[0][np.argmin(reachability_distances[seeds])]
        else:
            i = np.where(seeds)[0][0]

    if type(extraction) is str:
        estr = extraction.lower()
        if estr in EXTRACTION_FUNCTIONS:
            func = EXTRACTION_FUNCTIONS[estr]
            labels = func(ordering, reachability_distances, min_samples,
                          **ext_kwargs)
        else:
            raise ValueError('Unknown Extraction Method: %s' % estr)
    else:
        raise TypeError('Extraction Method must be a string.')

    return core_distances, ordering, reachability_distances, labels
# 2.4.2) sum this minimum distance to tot
# tot is the final distance between s0 and s2
# the s2 with minimum distance is the desired streamline

np.random.seed(0)
prototypes_id = np.random.permutation(dm.shape[0])[:200]
dp = dm[:, prototypes_id]  # dissimilarity projection

kdt = BallTree(dp)  # KDTree(dp)
radius = 100
k = 10
sid = 9

idx1 = kdt.query_radius(dp[sid], radius)[0]
# idx1 = kdt.query(dp[sid], k)[1][0]
dm_small1 = dm[idx1][:, idx1]
e1 = dm_small1[np.triu_indices(dm_small1.shape[0], 1)]

spgk = np.zeros(dm.shape[0])
for i in range(dm.shape[0]):
    idx2 = kdt.query_radius(dp[i], radius)[0]
    # idx2 = kdt.query(dp[i], k)[1][0]
    dm_small2 = dm[idx2][:, idx2]
    e2 = dm_small2[np.triu_indices(dm_small2.shape[0], 1)]
    spgk[i] = np.multiply.outer(np.exp(-e1), np.exp(-e2)).sum()
    print(i, spgk[i])
def variable_bw_mean_shift(X, bandwidth_array, seeds=None, max_iterations=300):
    """Variable bandwidth mean shift with gaussian kernel

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Input data.

    bandwidth : array[float], shape=[n_samples]
        Kernel bandwidth.

    seeds : array[float, float], shape=(n_seeds, n_features), optional
        Point used as initial kernel locations. Default is
        setting each point in input data as a seed.

    max_iter : int, default 300
        Maximum number of iterations, per seed point before the clustering
        operation terminates (for that seed point), if has not converged yet.

    Returns
    -------
    cluster_centers : array, shape=[n_clusters, n_features]
        Coordinates of cluster centers.

    labels : array, shape=[n_samples]
        Cluster labels for each point.

    Notes
    -----
    Code adapted from scikit-learn library.
    """
    if seeds is None:
        seeds = X

    n_points, n_features = X.shape
    stop_thresh = 1e-3 * np.mean(bandwidth_array)  # when mean has converged
    center_intensity_dict = {}
    cluster_centers = []
    ball_tree = BallTree(X)  # to efficiently look up nearby points

    def gaussian_kernel(x, points, bandwidth):
        distances = euclidean_distances(points, x)
        weights = np.exp(-1 * (distances ** 2 / bandwidth ** 2))
        return np.sum(points * weights, axis=0) / np.sum(weights)

    # For each seed, climb gradient until convergence or max_iterations
    for i, weighted_mean in enumerate(seeds):
        completed_iterations = 0
        while True:
            points_within = X[ball_tree.query_radius([weighted_mean], bandwidth_array[i])[0]]
            old_mean = weighted_mean  # save the old mean
            weighted_mean = gaussian_kernel(old_mean, points_within, bandwidth_array[i])
            converged = extmath.norm(weighted_mean - old_mean) < stop_thresh
            if converged or completed_iterations == max_iterations:
                if completed_iterations == max_iterations:
                    print("reached max iterations")
                cluster_centers.append(weighted_mean)
                center_intensity_dict[tuple(weighted_mean)] = len(points_within)
                break
            completed_iterations += 1

    # POST PROCESSING: remove near duplicate points
    # If the distance between two kernels is less than the bandwidth,
    # then we have to remove one because it is a duplicate. Remove the
    # one with fewer points.
    sorted_by_intensity = sorted(center_intensity_dict.items(),
                                 key=lambda tup: tup[1], reverse=True)
    sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
    unique = np.ones(len(sorted_centers), dtype=bool)
    ball_tree = BallTree(sorted_centers)

    for i, center in enumerate(sorted_centers):
        if unique[i]:
            neighbor_idxs = ball_tree.query_radius([center], np.mean(bandwidth_array))[0]
            unique[neighbor_idxs] = 0
            unique[i] = 1  # leave the current point as unique

    cluster_centers = sorted_centers[unique]

    # ASSIGN LABELS: a point belongs to the cluster that it is closest to
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="ball_tree").fit(cluster_centers)
    labels = np.zeros(n_points, dtype=int)
    distances, idxs = nbrs.kneighbors(X)
    labels = idxs.flatten()

    return cluster_centers, labels
train = int(total * TRAIN_PERCENTAGE)
test = total - train
distances = []
per_stops_popu = collections.defaultdict(int)
distance = dist(lat, lon, stop_lat, stop_lon)
if cnt < train:
    per_stops_popu[actual_stop] += 1
    distances.append(distance)
else:
    if cnt == train:
        idx = int(len(distances) * DISTANCE_FACTOR)
        if idx >= int(len(distances)):
            idx = -1
        per_radius = sorted(distances)[idx]
    # global
    stops = tree.query_radius([lat, lon], r=RADIUS)[0]
    l = [(id_to_stop_id[s][0],
          dist(lat, lon, id_to_stop_id[s][1], id_to_stop_id[s][2]),
          id_to_stop_id[s][3],
          id_to_stop_id[s][4]) for s in stops]
    if actual_stop in get_largest_n(l, [1], LIST_SIZE):
        global_nearest += 1
    if actual_stop in get_largest_n(l, [2], LIST_SIZE):
        global_route += 1
    if actual_stop in get_largest_n(l, [3], LIST_SIZE):
        global_popu += 1
    if distance > RADIUS:
        global_lost += 1
    # personalized
    stops = tree.query_radius([lat, lon], r=per_radius)[0]
    l = [(id_to_stop_id[s][0],
          dist(lat, lon, id_to_stop_id[s][1], id_to_stop_id[s][2]),
          id_to_stop_id[s][3],
          id_to_stop_id[s][4],
          per_stops_popu[s]) for s in stops]
    if actual_stop in get_largest_n(l, [1], LIST_SIZE):
        per_nearest += 1
    if actual_stop in get_largest_n(l, [2], LIST_SIZE):
def two_point(data, bins, method='standard', data_R=None, random_state=None):
    """Two-point correlation function

    Parameters
    ----------
    data : array_like
        input data, shape = [n_samples, n_features]
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    method : string
        "standard" or "landy-szalay".
    data_R : array_like (optional)
        if specified, use this as the random comparison sample
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle all but one axis to get background distribution
    if data_R is None:
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if sklearn_has_two_point:
        # Fast two-point correlation functions added in scikit-learn v. 0.14
        KDT_D = KDTree(data)
        KDT_R = KDTree(data_R)

        counts_DD = KDT_D.two_point_correlation(data, bins)
        counts_RR = KDT_R.two_point_correlation(data_R, bins)
    else:
        warnings.warn("Version 0.3 of astroML will require scikit-learn "
                      "version 0.14 or higher for correlation function "
                      "calculations. Upgrade to sklearn 0.14+ now for much "
                      "faster correlation function calculations.")

        BT_D = BallTree(data)
        BT_R = BallTree(data_R)

        counts_DD = np.zeros(Nbins + 1)
        counts_RR = np.zeros(Nbins + 1)

        for i in range(Nbins + 1):
            counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i],
                                                    count_only=True))
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i],
                                                    count_only=True))

    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)

    # check for zero in the denominator
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor ** 2 * DD / RR - 1
    elif method == 'landy-szalay':
        if sklearn_has_two_point:
            counts_DR = KDT_R.two_point_correlation(data, bins)
        else:
            counts_DR = np.zeros(Nbins + 1)
            for i in range(Nbins + 1):
                counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i],
                                                        count_only=True))

        DR = np.diff(counts_DR)

        corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    return corr
class AngularCatalog(object): """ This class is the workhorse of Py2PAC. It manages the actual catalogs of objects, it creates various objects to hold information, and it performs the correlation function calculations on the catalogs. AngularCatalogs are single-bin objects, so if you want to sub-divide your data set, do so before you pull it into AngularCatalogs. Future releases of Py2PAC will include a MultiCatalog that manages slicing a catalog into bins. Parameters ---------- ra : array-like A list of RAs for your objects in degrees dec : array-like A list of declinations for your objects in degrees generate_randoms : bool (optional) If True, ``__init__`` will call the mask's random generation to produce a random sample of size ``len(ra) * default_oversample``. If False, no randoms will be generated. Default is False. default_oversample : float (optional) The default number of randoms to make in units of the number of data points. If ``default_oversample==1``, then by default the object will generate the same number of randoms as you have data points. If ``default_oversample==1``, then by default the object will generate twice as many randoms as you have data points, etc. Default value is 1. properties : dictionary (optional) Any additional properties that you want to carry around with the angular positions. This isn't used at all by AngularCatalog, but makes it easier to access things. weight_file : string (optional) A path from / to a FITS weight file to be used to generate the ImageMask. image_mask : ImageMask instance (optional) An instance of an ImageMask object. Returns ------- cat : AngularCatalog instance The AngularCatalog instance with all the properties that you gave it. """ #------------------# #- Initialization -# #------------------#Your data and masked data will be the same def __init__(self, ra, dec, generate_randoms=False, default_oversample=1., properties=None, weight_file=None, image_mask=None): """ The init function for the AngularCatalog class """ #Make sure we have Numpy arrays ra = np.array(ra) dec = np.array(dec) #Check to make sure we have sensible values for RA and Dec if ra.ndim != 1: raise ValueError('RA list must be a 1D array') if dec.ndim != 1: raise ValueError('Dec list must be a 1D array') if dec.size != ra.size: raise ValueError('RA and Dec arrays must be the same length') #Now store the RA and Dec information self._ra = ra self._dec = dec self._ra_range = np.array([ra.min(), ra.max()]) self._ra_span = np.diff(self._ra_range)[0] self._dec_range = np.array([dec.min(), dec.max()]) self._dec_span = np.diff(self._dec_range)[0] self._input_n_objects = ra.size self._n_objects=None #Store the info from keywords self._image_mask = image_mask self._weight_file_name = weight_file self._properties = properties self._random_oversample = default_oversample #Store some defaults/holders self._theta_bins=None self._cfs={} #Make blank things so I can ask "is None" rather than "exists" self._data_tree=None self._random_tree=None self._ra_random=None self._dec_random=None self._Gp=None self._completeness=None self._use=None self._use_random=None self._subregion_number=None #Set up the mask and generate the randoms if asked self.setup_mask() if generate_randoms: self.generate_random_sample() #------------------------------------------------------------------------------------------ #--------------------------------------------# #- Class method for making a random catalog -# #--------------------------------------------# @classmethod def random_catalog(cls, n_randoms, 
image_mask = None, ra_range=None, dec_range=None): """ Creates an AngularCatalog populated with RAs and Decs placed randomly within the mask. This can be passed either an image mask or an RA and Dec range **Syntax** * cat = ac_class.AngularCatalog.random_catalog(n_randoms, image_mask=ImageMask_object) OR * cat = ac_class.AngularCatalog.random_catalog(n_randoms, ra_range=[min, max], dec_range=[min, max]) Parameters ---------- n_randoms : scalar The number of randoms that you want in you catalog image_mask : ImageMask object (optional) An ImageMask object with the outline that you want for your randoms. This is one option. ra_range : two-element array-like (optional) The minimum and maximum RA you would like your randoms to have. This is an alternative to the image_mask option. This must be combined with the dec_range argument as well. dec_range : two-element array-like (optional) The minimum and maximum Dec you would like your randoms to have. This is an alternative to the image_mask option. This must be combined with the ra_range argument. Returns ------- cat : AngularCatalog object An AngularCatalog instance with n_randoms distributed over either the image_mask or over the RA and Dec range. """ #Make an image mask from the RA and Dec ranges if we don't have an #image mask already need_image_mask = image_mask is None if need_image_mask: image_mask = imclass.ImageMask.from_ranges(ra_range, dec_range) #Use the ImageMask to create random RAs and Decs and make them into #an AngularCatalog with the corresponding mask. ra, dec, comp = image_mask.generate_random_sample(n_randoms) return AngularCatalog(ra, dec, image_mask=image_mask) #------------------------------------------------------------------------------------------ #----------------------------# #- Set the weight file name -# #----------------------------# def set_mask_to_weight_file(self, filename): """ Set the weight file name and process the file to an image mask Parameters ---------- filename : string The location of the FITS file that you want to process to a weight mask. 
The file name should be specified from / """ self._weight_file_name=filename self.setup_mask(force_remake=True) return #------------------------------------------------------------------------------------------ #-------------------------------------------# #- Make an image mask from the weight file -# #-------------------------------------------# def setup_mask(self, force_remake=False): #Create an image mask (from the weight file if given one) if (self._image_mask is None) or force_remake: if self._weight_file_name is not None: immask = imclass.ImageMask.from_FITS_weight_file(self._weight_file_name) else: immask = imclass.ImageMask.from_ranges(self._ra_range, self._dec_range) #Ask the mask for the completenesses of each data object self._completeness = self._image_mask.return_completenesses(self._ra, self._dec) #Generate random numbers- this is basically for when we have non-binary completeness compare_to = rand.random(size=self._n_objects) #Use the random numbers to figure out which guys in the data to use self._use = compare_to < self._completeness #Set up the data tree now that we have a mask self.make_data_tree() #Record how many objects we're actually using self._n_objects=len(self._ra[self._use]) #------------------------------------------------------------------------------------------ #-------------# #- Move mask -# #-------------# def move_mask(self, delta_ra=None, delta_dec=None, theta_degrees=None, preview=False): #Calls the image mask's translation/rotation routine. if preview: newmask=self._image_mask.move_mask_on_sky(delta_ra=delta_ra, delta_dec=delta_dec, theta_degrees=theta_degrees, preview=preview) return newmask else: self._image_mask.move_mask_on_sky(delta_ra=delta_ra, delta_dec=delta_dec, theta_degrees=theta_degrees, preview=preview) #------------------------------------------------------------------------------------------ #------------------------------------------------------------------------------------------ #---------------------------------# #- Compute BallTree for the data -# #---------------------------------# def make_data_tree(self): #The astroML correlation function methods want a cartesian position #instead of the angular positions- this does the conversion print "make_datatree says: Computing the BallTree for data." data = np.asarray(corr.ra_dec_to_xyz(self._ra[self._use], self._dec[self._use]), order='F').T self._data_tree = BallTree(data, leaf_size=2) return #------------------------------------------------------------------------------------------ #------------------------------------------# #- Compute BallTree for the random sample -# #------------------------------------------# def make_random_tree(self): #Make sure we have the random data made if (self._ra_random is None) or (self._dec_random is None): print "make_random_tree says: no random sample found. Generating one." self.generate_random_sample() #Make the tree print "make_randomtree says: Computing the BallTree for the randoms." random_data=np.asarray(corr.ra_dec_to_xyz(self._ra_random, self._dec_random), order='F').T self._random_tree = BallTree(random_data, leaf_size=2) return #------------------------------------------------------------------------------------------ def set_theta_bins(self, min_theta, max_theta, nbins, unit='a', logbins=True): #Make a ThetaBins class and save it. 
self._theta_bins = binclass.ThetaBins(min_theta, max_theta, nbins, unit=unit, logbins=logbins) #------------------------------------------------------------------------------------------ #---------------------------------------------------------------------# #- Check to make sure we have all the info needed for CF calculation -# #---------------------------------------------------------------------# def __check_cf_setup(self, need_subregions=False, random_oversample=None, check_trees=True): #Make sure that we have all the things we need to do a #correlation function properly (I got tired of the redundant #code in the different CF calculation routines) #Check that we have the bins if not isinstance(self._theta_bins, binclass.ThetaBins): raise ValueError("CF calculations need separation bins. Use " "catalog.set_theta_bins(min_theta, max_theta," "nbins, unit='arcsec', logbins=True)") #Change/store the random oversampling factor if it's given if random_oversample is not None: self._random_oversample=random_oversample #Check the existence of a random sample if self._ra_random is None: self.generate_random_sample() #See if we're properly oversampled. nR=len(self._ra_random) if nR != len(self._ra)*self._random_oversample: self.generate_random_sample() #Check to make sure we have the trees for the appropriate guys if check_trees: if self._data_tree is None: self.make_data_tree() if self._random_tree is None: self.make_random_tree() #Check to make sure that the subdivisions have happened #if need_subregions. If not, throw an error because it's #too specific to fill it in automatically if need_subregions: if self._subregion_number is None: raise ValueError("Jackknife and block bootstrap require " "that you subdivide the field. Call the " "catalog.subdivide_mask() routine first.") #------------------------------------------------------------------------------------------ #-----------------------------------------------------# #- Calculate the correlation function without errors -# #-----------------------------------------------------# def cf(self, estimator='landy-szalay', n_iter=1, clobber=False, random_oversample=None, save_steps_file=None, name='cf'): #This uses the info we have plus the astroML correlation package # to compute the angular correlation function. #The idea is that this function will figure out what information # is available and call the appropriate (most efficient) function # with all the relevant information. #This function will store the values it calculates for missing info if (name in self._cfs.keys()) and not clobber: raise ValueError("CorrelationFunction.cf says: There's already" " a CF by that name. 
Please choose another or " "overwrite by calling with clobber=True") #Make sure that we have everything we need and fix anything missing that's fixable self.__check_cf_setup(random_oversample=random_oversample, need_subregions=False, check_trees=True) #Make a new CorrelationFunction instance and set the basic info #First make a dictionary of the arguments to pass because it's ugly info={'name' : name, 'cf_type' : 'no_error', 'ngals' : self._n_objects, 'theta_bin_object' : copy.deepcopy(self._theta_bins), 'estimator' : estimator } self._cfs[name] = cfclass.CorrelationFunction(**info) centers, edges = self._cfs[name].get_thetas(unit='degrees') nbins=len(centers) #Do the calculation cf=np.zeros(nbins) DD=np.zeros(nbins) print "AngularCatalog.cf says: doing a CF calculation without error estimation" iterations={} for it in np.arange(n_iter): this_cf, this_dd = corr.two_point_angular(self._ra[self._use], self._dec[self._use], edges, BT_D=self._data_tree, BT_R=self._random_tree, method=estimator, ra_R=self._ra_random, dec_R=self._dec_random, return_DD=True) iterations[it]=this_cf cf += this_cf DD = this_dd/2. if save_steps_file is not None: self._cfs[name].set_cf(cf/(it+1), np.zeros(nbins), iterations=iterations) self._cfs[name].set_DD(DD) self.save_cf(save_steps_file, cf_keys=name) if n_iter >1: self.generate_random_sample() #Divide out the number of iterations cf/=n_iter #Make sure we've stored everything properly even if we're not saving self._cfs[name].set_cf(cf, np.zeros(nbins), iterations=iterations) #------------------------------------------------------------------------------------------ #----------------------------------------------------# #- Find the CF and error by single-galaxy bootstrap -# #----------------------------------------------------# def cf_bootstrap(self, n_boots=10, bootstrap_oversample=1, random_oversample=None, estimator='landy-szalay', save_steps_file=None, name='galaxy_bootstrap', clobber=False): #Calculate the correlation function with single-galaxy bootstrapping if (name in self._cfs.keys()) and not clobber: raise ValueError("CorrelationFunction.cf_bootstrap says: " "There's already a CF by that name. Please " "choose another or overwrite by calling with " "clobber=True") #Check that everything is set up self.__check_cf_setup(need_subregions=False, check_trees=False, random_oversample=random_oversample) #Make a new CorrelationFunction instance and set the basic info #First make a dictionary of the arguments to pass because it's ugly info={'name' : name, 'cf_type' : 'single_galaxy_bootstrap', 'ngals' : self._n_objects, 'theta_bin_object' : copy.deepcopy(self._theta_bins), 'estimator' : estimator } self._cfs[name] = cfclass.CorrelationFunction(**info) centers, edges = self._cfs[name].get_thetas(unit='degrees') nbins=len(centers) #Make an array so it's easy to average over the boots temp = np.zeros((n_boots, nbins)) #This RR will keep track of the RR counts so you don't have to #calculate them every time. 
rr=None #A holder for the boots that will be passed to the #CorrelationFunction as the iterations bootstrap_boots={} print ("AngularCatalog.cf_bootstrap says: doing a bootstrap " "CF calculation") #Loop through the boots for i in np.arange(n_boots): #Give a progress report print "calculating boot", i #Choose the right number of galaxies *with replacement* ind=np.random.randint(0, self._n_objects, bootstrap_oversample*self._n_objects) ra_b=self._ra[self._use][ind] dc_b=self._dec[self._use][ind] #Calculate this boot bootstrap_boots[i], rr = corr.two_point_angular(ra_b, dec_b, edges, BT_D=self._data_tree, BT_R=self._random_tree, method=estimator, ra_R=self._ra_random, dec_R=self._dec_random, RR=rr, return_RR=True) #Store what we have temp[i]=bootstrap_boots[i] if (save_steps_file is not None): bootstrap_cf=np.nanmean(temp[0:i+1], axis=0) bootstrap_cf_err=np.nanstd(temp[0:i+1], axis=0) self.save_cfs(save_steps_file, cf_keys=[name]) #Now we're done- do the final storage. bootstrap_cf=np.nanmean(temp, axis=0) bootstrap_cf_err=np.nanstd(temp, axis=0) self._cfs[name].set_cf(bootstrap_cf, bootstrap_cf_err, iterations=bootstrap_boots) self._cfs[name].set_counts(RR=rr) #------------------------------------------------------------------------------------------ #----------------------------------------# #- Find the CF and error by jackknifing -# #----------------------------------------# def cf_jackknife(self, ignore_regions=[], estimator='landy-szalay', random_oversample=None, save_steps_file=None, name='jackknife', clobber=False): #This takes a divided mask and performs the correlation #function calculation on the field with each sub-region #removed in turn. if (name in self._cfs.keys()) and not clobber: raise ValueError("CorrelationFunction.cf_jackknife says: " "There's already a CF by that name. 
Please " "choose another or overwrite by calling with " "clobber=True") #Check to make sure we have everything we need self.__check_cf_setup(need_subregions=True, check_trees=False, random_oversample=random_oversample) #Make a new CorrelationFunction instance and set the basic info #First make a dictionary of the arguments to pass because it's ugly info={'name' : name, 'cf_type' : 'jackknife', 'ngals' : self._n_objects, 'theta_bin_object' : copy.deepcopy(self._theta_bins), 'estimator' : estimator } self._cfs[name] = cfclass.CorrelationFunction(**info) centers, edges = self._cfs[name].get_thetas(unit='degrees') #pull out the unique subregion numbers and figure out which to use regions=np.asarray(list(set(self._subregion_number))) use_regions=[r for r in regions if (r not in ignore_regions) and (r != -1)] use_regions=np.array(use_regions) n_jacks=len(use_regions) #Figure out where the randoms are random_subregions=self._image_mask.return_subregions(self._ra_random, self._dec_random) #Now loop through the regions that you should be using #and calculate the correlation function leaving out each jackknife_jacks = {} #Make a mask that takes out all the galaxies that aren't in use_regions valid_subregion = ma.masked_not_equal(self._subregion_number, -1).mask random_valid_subregion=ma.masked_not_equal(random_subregions, -1).mask for bad_reg in ignore_regions: this_mask = ma.masked_not_equal(self._subregion_number, bad_reg).mask valid_subregion = valid_subregion & this_mask this_mask = ma.masked_not_equal(random_subregions, bad_reg).mask random_valid_subregion = random_valid_subregion & this_mask temp = np.zeros((n_jacks, len(self._cf_thetas))) for i, r in enumerate(use_regions): #Make the mask for the data not_region_r = ma.masked_not_equal(self._subregion_number, r).mask this_jackknife = valid_subregion & not_region_r & self._use #Make the mask for the randoms random_not_region_r = ma.masked_not_equal(random_subregions, r).mask random_this_jackknife = random_not_region_r & random_valid_subregion #Do the calculation for this jackknife and store it print "calculating jackknife", i jackknife_jacks[r] = corr.two_point_angular(self._ra[this_jackknife], self._dec[this_jackknife], edges, method=estimator, ra_R = self._ra_random[random_this_jackknife], dec_R = self._dec_random[random_this_jackknife]) temp[i]=jackknife_jacks[r] if (save_steps_file is not None): jackknife_cf=np.nanmean(temp[0:i+1], axis=0) jackknife_cf_err=np.nanstd(temp[0:i+1], axis=0) self._cfs[name].set_cf(jackknife_cf, jackknife_cf_err, iterations=bootstrap_boots) self.save_cfs(save_steps_file, cf_keys=[name]) #Now that we have all of the jackknifes (jackknives?), calculate the mean # and variance. jackknife_cf=np.nanmean(temp, axis=0) jackknife_cf_err=np.nanstd(temp, axis=0) self._cfs[name].set_cf(jackknife_cf, jackknife_cf_err, iterations=bootstrap_boots) #------------------------------------------------------------------------------------------ #--------------------------------------------# #- Find the CF and error by block bootstrap -# #--------------------------------------------# def cf_block_bootstrap(self, n_boots=10, ignore_regions=None, estimator='landy-szalay', random_oversample=None, bootstrap_oversample=1, save_steps_file=None, name='block_bootstrap', clobber=False): #Use the subdivided mask to bootstrap on blocks rather than #single galaxies. if (name in self._cfs.keys()) and not clobber: raise ValueError("CorrelationFunction.cf_block_bootstrap says: " "There's already a CF by that name. 
Please " "choose another or overwrite by calling with " "clobber=True") #Check to make sure I have everything that I need self.__check_cf_setup(masked=True, need_subregions=True, random_oversample=random_oversample, check_trees=False) #Make a new CorrelationFunction instance and set the basic info #First make a dictionary of the arguments to pass because it's ugly info={'name' : name, 'cf_type' : 'jackknife', 'ngals' : self._n_objects, 'theta_bin_object' : copy.deepcopy(self._theta_bins), 'estimator' : estimator } self._cfs[name] = cfclass.CorrelationFunction(**info) centers, edges = self._cfs[name].get_thetas(unit='degrees') nbins = len(centers) print "block boots done with setup" #Figure out which subregions we should be using regions=np.asarray(list(set(self._subregion_number))) use_regions=[r for r in regions if (r not in ignore_regions) and (r != -1)] use_regions=np.array(use_regions) #Figure out where the randoms are random_subregions=self._image_mask.return_subregions(self._ra_random, self._dec_random) #Make a dictionary of arrays containing the indices of the members of each sub-region we need indices={} random_indices={} for r in use_regions: indices[r]=np.where(self._subregion_number == r)[0] random_indices[r]=np.where(random_subregions == r)[0] #Loop through the bootstraps block_bootstrap_boots={} n_choose=len(use_regions)*bootstrap_oversample temp = np.zeros((n_boots, nbins)) print "block boots looping through boots" for i in np.arange(n_boots): this_boot=rand.choice(use_regions, size=n_choose) this_boot_indices=np.array([], dtype=np.int) this_boot_random_indices=np.array([], dtype=np.int) for region in this_boot: this_boot_indices=np.concatenate((this_boot_indices, indices[region])) this_boot_random_indices=np.concatenate((this_boot_random_indices, random_indices[region])) # this_boot_indices=np.array( print "calculating boot", i temp[i] = corr.two_point_angular(self._ra[this_boot_indices], self._dec[this_boot_indices], edges, method=estimator, ra_R=self._ra_random[this_boot_random_indices], dec_R=self._dec_random[this_boot_random_indices]) block_bootstrap_boots[i] = temp[i] cf=np.nanmean(temp[0:i+1], axis=0) cf_err=np.nanstd(temp[0:i+1], axis=0) self._cfs[name].set_cf(cf, cf_err, iterations=bootstrap_boots) if (save_steps_file is not None): self.save_cfs(save_steps_file, cfkeys=[name]) #------------------------------------------------------------------------------------------ #----------------------------------------------------------------# #- Generate the random-random counts required to compute the IC -# #----------------------------------------------------------------# def generate_rr(self, set_nbins=None, logbins=True, min_sep=0.01, force_n_randoms=None, save_to=None, n_chunks=1): #Do random-random counts over the entire field. If set_nbins is declared, #generate_rr will not go looking for the correlation functions so that the #RR counts for the IC calculation and the CF calculation can be done in parallel. #Figure out how many randoms we need. This was calculated by playing with #the number of randoms in the GOODS-S field and seeing when the RR counts converged #to the "way too many" curve. 27860 per 1.43e-5 steradians was what I settled on. #If there's a forced number, it will ignore my estimate. # Amendment 4/15- this minimum number seems to be somewhat too small for fields that # aren't as smooth as GOODS-S, so I'm multiplying it by 5. This looks ok. # Amendment 8/15- added the capability to do this in several chunks. 
if force_n_randoms is None: surface_density_required = 27860.*5./1.43e-5 area = self._image_mask.masked_area_solid_angle() number_needed = surface_density_required * area else: number_needed=force_n_randoms #If we're doing more than one chunk, divide the number we need into n_chunks chunks if n_chunks > 1: number_needed = np.ceil(float(number_needed)/n_chunks).astype(int) total_number = number_needed * n_chunks print "total number: ", total_number print "number per iteration: ", number_needed print "number of chunks: ", n_chunks #Range of separations to make bins over min_ra = self._ra[self._use].min() min_dec = self._dec[self._use].min() max_ra = self._ra[self._use].max() max_dec = self._dec[self._use].max() max_sep=misc.ang_sep(min_ra, min_dec, max_ra, max_dec, radians_in=False, radians_out=False) #Choose how many bins if set_nbins is None: #Get our theta bin info from the CF if we can. Error if we can't if self._theta_bins is None: raise ValueError("AngularCatalog.generate_rr says: I need" " either a set number of bins (set_nbins=N)" " or thetas from a CF to extrapolate. " " You have given me neither.") centers, edges = self._cfs[name].get_thetas(unit='degrees') nbins= np.ceil( len(centers) * 2. * max_sep/edges.max()) else: nbins=set_nbins #Make the bins rr_theta_bins = binclass.ThetaBins(min_sep, max_sep, nbins, unit='d', logbins=logbins) use_centers, use_theta_bins = rr_theta_bins.get_thetas(unit='degrees') #Do the loop G_p=np.zeros(nbins) rr_counts=np.zeros(nbins) for n_i in np.arange(n_chunks): print "doing chunk #", n_i #Remake the random sample so we're sure we have the right oversample factor self.generate_random_sample(masked=True, make_exactly=number_needed) #Code snippet shamelessly copied from astroML.correlations xyz_data = corr.ra_dec_to_xyz(self._ra_random, self._dec_random) data_R = np.asarray(xyz_data, order='F').T bins = corr.angular_dist_to_euclidean_dist(use_theta_bins) Nbins = len(bins) - 1 counts_RR = np.zeros(Nbins + 1) for i in range(Nbins + 1): counts_RR[i] = np.sum(self._random_tree.query_radius(data_R, bins[i], count_only=True)) rr = np.diff(counts_RR) #Landy and Szalay define G_p(theta) as <N_p(theta)>/(n(n-1)/2) G_p += rr/(number_needed*(number_needed-1)) rr_counts += rr print "Dividing out the theta bin sizes and number of chunks" #I divide out the bin width because just using the method #that L&S detail gives you a {G_p,i} with the property that #Sum[G_p,i]=1. This is not equivalent to Integral[G_p d(theta)]=1, #which is what they assume everywhere else. #Dividing out the bin width gives you that and lets you pretend #G_p is a continuous but chunky-looking function. G_p /= np.diff(use_theta_bins) G_p /= n_chunks self._rr_ngals=[total_number, n_chunks] self._Gp = gpclass.Gp(min_sep, max_sep, nbins, G_p, total_number, n_chunks, logbins=logbins, unit='d', RR=rr_counts) if save_to is not None: self.save_gp(save_to) #------------------------------------------------------------------------------------------ #-------------------------------------# #- Read in previously calculated CFs -# #-------------------------------------# def load_cf(self, filen, overwrite_existing=False, name_prefix=''): #Load in a CF from a file or set of files #First, what files start with filen? 
        file_list = misc.files_starting_with(filen)
        nfiles = len(file_list)

        #Generate the names
        names = copy.copy(file_list)
        for i, n in enumerate(names):
            #Strip the file base off the front (lstrip would strip characters, not the prefix)
            names[i] = name_prefix + n[len(filen):]

    #------------------------------------------------------------------------------------------

    #--------------------------------------------#
    #- Save the correlation functions to a file -#
    #--------------------------------------------#
    def save_cf(self, file_base, cf_keys=None):
        #Takes all the CF information we have and saves one file per CF

        #If they didn't say which ones specifically, save all
        if cf_keys is None:
            cf_keys = self._cfs.keys()

        for k in cf_keys:
            filen = file_base + k
            self._cfs[k].save(filen)

    #------------------------------------------------------------------------------------------

    #-----------------------------------------------------------------#
    #- Read in previously calculated random-random counts for the IC -#
    #-----------------------------------------------------------------#
    def load_gp(self, filename, overwrite_existing=False):
        #Take the ASCII file with the normed random-random counts and read it in
        if (self._Gp is None) or overwrite_existing:
            self._Gp = gpclass.Gp.from_file(filename)
        else:
            print ("angular_catalog.load_gp says: You've asked me not "
                   "to overwrite the existing RR counts and there's "
                   "already Gp information.")

    #------------------------------------------------------------------------------------------

    #--------------------------------------------#
    #- Save the random-random counts for the IC -#
    #--------------------------------------------#
    def save_gp(self, filename):
        #If we have done the random-random counts for the integral constraint, save to a file
        self._Gp.save(filename)
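# A minimal sketch of the pair counting that AngularCatalog's BallTrees are built
# for: convert RA/Dec to unit-sphere Cartesian coordinates, turn angular bins into
# chord lengths, and count pairs with query_radius. Assumes astroML is installed;
# the RA/Dec values and bin edges are illustrative.
import numpy as np
from sklearn.neighbors import BallTree
from astroML.correlation import ra_dec_to_xyz, angular_dist_to_euclidean_dist

rng = np.random.RandomState(0)
ra = rng.uniform(150.0, 150.5, 500)                       # degrees
dec = rng.uniform(2.0, 2.5, 500)                          # degrees
xyz = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T

tree = BallTree(xyz, leaf_size=2)
theta_edges = np.logspace(-3, -1, 11)                     # angular bin edges in degrees
r_edges = angular_dist_to_euclidean_dist(theta_edges)     # chord lengths on the unit sphere

# Cumulative pair counts within each edge; np.diff gives per-bin DD counts.
cumulative = np.array([tree.query_radius(xyz, r, count_only=True).sum() for r in r_edges])
DD = np.diff(cumulative)
print(DD)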
def tract_smooth(tractography, var, file_output): from sklearn.neighbors import BallTree var = float(var) std = var ** 2 points = tractography.original_tracts() all_points = numpy.vstack(points) bt = BallTree(all_points) N = len(all_points) / 3 I = numpy.eye(3)[None, ...] for i, tract in enumerate(tractography.original_tracts()): # all_points = numpy.vstack(points[:i] + points[i + 1:]) # bt = BallTree(all_points) diff = numpy.diff(tract, axis=0) diff = numpy.vstack((diff, diff[-1])) lengths = numpy.sqrt((diff ** 2).sum(1)) # cum_lengths = numpy.cumsum(lengths) diff_norm = diff / lengths[:, None] tangent_lines = diff_norm[:, None, :] * diff_norm[:, :, None] normal_planes = I - tangent_lines # weight_matrices = normal_planes + 1e10 * tangent_lines N = max(len(d) for d in bt.query_radius(tract, var * 3)) close_point_distances, close_point_indices = bt.query( tract, N ) close_points = all_points[close_point_indices] difference_vectors = close_points - tract[:, None, :] projected_vectors = ( normal_planes[:, None, :] * difference_vectors[..., None] ).sum(-2) projected_points = projected_vectors + tract[:, None, :] # projected_distances2 = (projected_vectors**2).sum(-1) # projected_weights = numpy.exp(- .5 * projected_distances2 / std) # projected_weights /= projected_weights.sum(-1)[:, None] weights = numpy.exp( -.5 * close_point_distances ** 2 / std )[..., None] weights /= weights.sum(-2)[..., None] # tract += (weights * projected_vectors).sum(-2) # weighted_distances = ( # weight_matrices[:, None, :] * # difference_vectors[..., None] # ).sum(-2) # weighted_distances *= difference_vectors # weighted_distances = weighted_distances.sum(-1) ** .5 # weighted_points = (projected_points * weights).sum(1) weighted_points = (projected_points * weights).sum(1) tract[:] = weighted_points # tract /= norm_term return Tractography( tractography.original_tracts(), tractography.original_tracts_data(), **tractography.extra_args )
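# A sketch of the adaptive-k pattern used in tract_smooth() above: size a fixed-k
# nearest-neighbour query from a radius query so the neighbour arrays come back as
# rectangular ndarrays. The random points and radius below are illustrative.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
points = rng.random_sample((2000, 3))
queries = rng.random_sample((50, 3))
radius = 0.1

bt = BallTree(points)
# The largest neighbourhood within `radius` sets k for a single rectangular query.
k = max(len(ind) for ind in bt.query_radius(queries, radius))
distances, indices = bt.query(queries, k)    # both have shape (50, k)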
def mean_shift(X, bandwidth=None, seeds=None, kernel="flat", max_cluster_radius=-1., max_iterations=300): """Perform MeanShift Clustering of data using the specified kernel Parameters ---------- X : array [n_samples, n_features] Input points to be clustered bandwidth : float, Kernel bandwidth seeds: array [n_seeds, n_features], optional Points used as initial kernel locations If not set, then use every point as a seed (which may be very slow---consider using the `get_bin_seeds` function to create a reduced set of seeds. max_cluster_radius: float, default -1. Used only in post-processing. If negative, then each point is clustered into its nearest cluster. If positive, then those points that are not within `max_cluster_radius` of any cluster center are said to be 'orphans' that do not belong to any cluster. Orphans are given cluster label -1. Returns ------- cluster_centers : array [n_clusters, n_features] Coordinates of cluster centers labels : array [n_samples] cluster labels for each point Notes ----- See examples/plot_meanshift.py for an example. """ if seeds is None: seeds = X elif len(seeds) == 0: raise ValueError, "If a list of seeds is provided it cannot be empty." if not (kernel in KERNELS): valid_kernels = " ".join(KERNELS) raise ValueError, "Kernel %s is not valid. Valid kernel choices are: %s " % (kernel, valid_kernels) # Set maximum neighbor query distance based on kernel if kernel in ["flat"]: query_distance = bandwidth kernel_update_function = flat_kernel_update print "Using flat kernel update" elif kernel in ["gaussian"]: query_distance = bandwidth * 3 # A bit arbitrary kernel_update_function = gaussian_kernel_update print "Using gaussian kernel update" else: raise ValueError, "Kernel %s not implemented correctly" % kernel n_points, n_features = X.shape stop_thresh = 1e-3 * bandwidth # when mean has converged center_intensity_dict = {} ball_tree = BallTree(X) # to efficiently look up nearby points # For each seed, climb gradient until convergence or max_iterations for weighted_mean in seeds: completed_iterations = 0 while True: # Find mean of points within bandwidth points_within = X[ball_tree.query_radius([weighted_mean], query_distance)[0]] if len(points_within) == 0: break # Depending on seeding strategy this condition may occur old_mean = weighted_mean # save the old mean weighted_mean = kernel_update_function(old_mean, points_within, bandwidth) # If converged or at max_iterations, addS the cluster if extmath.norm(weighted_mean - old_mean) < stop_thresh or \ completed_iterations == max_iterations: center_intensity_dict[tuple(weighted_mean)] = len(points_within) break completed_iterations += 1 # POST PROCESSING: remove near duplicate points # If the distance between two kernels is less than the bandwidth, # then we have to remove one because it is a duplicate. Remove the # one with fewer points. 
print "%d clusters before removing duplicates " % len(center_intensity_dict) sorted_by_intensity = sorted(center_intensity_dict.items(), key=lambda tup: tup[1], reverse=True) sorted_centers = np.array([tup[0] for tup in sorted_by_intensity]) unique = np.ones(len(sorted_centers), dtype=np.bool) cc_tree = BallTree(sorted_centers) for i, center in enumerate(sorted_centers): if unique[i]: neighbor_idxs = cc_tree.query_radius([center], bandwidth)[0] unique[neighbor_idxs] = 0 unique[i] = 1 # leave the current point as unique cluster_centers = sorted_centers[unique] print "%d clusters after removing duplicates " % len(cluster_centers) # ASSIGN LABELS: a point belongs to the cluster that it is closest to centers_tree = BallTree(cluster_centers) labels = np.zeros(n_points, dtype=np.int) distances, idxs = centers_tree.query(X, 1) if max_cluster_radius < 0: labels = idxs.flatten() else: labels[:] = -1 bool_selector = distances.flatten() <= max_cluster_radius labels[bool_selector] = idxs.flatten()[bool_selector] return cluster_centers, labels
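# An illustrative call of the mean_shift() defined above, assuming its module-level
# helpers (KERNELS, flat_kernel_update, extmath) are in scope; the two-blob data
# and the bandwidth are assumptions, not values from the original code.
import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.normal(loc, 0.3, size=(100, 2)) for loc in ([0.0, 0.0], [3.0, 3.0])])

centers, labels = mean_shift(X, bandwidth=1.0, kernel="flat")
print(centers)                          # roughly one center per blob
print(np.bincount(labels[labels >= 0]))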
def evaluate_embeddings(embeddings, edges, cda=True, greedy_routing=False, cda_max_vertices=1000, gr_max_pairs=10000): "evaluate quality of embeddings compared with real elge set" report = [] # get connected component true_graph = nx.Graph() true_graph.add_edges_from(edges) Gcc=sorted(nx.connected_component_subgraphs(true_graph), key=len, reverse=True) true_graph=Gcc[0] # use BallTree for efficient graph construction print "construct BallTree" vertices = list(true_graph.nodes()) n = len(vertices) embeddings_array = np.array([embeddings[v] for v in vertices]) bt = BallTree(embeddings_array, metric=distance) degrees = defaultdict(int) print "compute number of correct directed arcs" for v1, v2 in edges: degrees[v1] += 1 degrees[v2] += 1 # compute number of correct DIRECTED arcs assuming that degrees are known if cda: all_correct_arcs = set() cda_vertices = vertices[:] if len(cda_vertices) > cda_max_vertices: np.random.shuffle(cda_vertices) cda_vertices = cda_vertices[:cda_max_vertices] for v_i, v in enumerate(cda_vertices): start = time.time() degree = degrees[v] dist, ind = bt.query(embeddings[v], k=degree+1) # one of neighbors is vertex inself neigh = [vertices[i] for i in ind[0].tolist() if vertices[i] != v] for ne in neigh: if make_edge(v, ne) in edges: all_correct_arcs.add((v, ne)) finish = time.time() #print "DEBUG: {} / {}, time={}s".format(v_i + 1, len(cda_vertices), datetime.timedelta(seconds=finish-start)) report.append(['ratio of correct arcs for known degrees', float(len(all_correct_arcs)) / (2 * len(edges))]) if greedy_routing: print "compute greedy routing efficiency" random_pairs = set() if n * (n-1) / 2 <= gr_max_pairs: random_pairs = set(combinations(vertices, 2)) else: while(len(random_pairs) < gr_max_pairs): v1 = np.random.choice(vertices) v2 = np.random.choice(vertices) if v1 != v2: random_pairs.add((v1, v2)) total_distribution = defaultdict(int) success_distribution = defaultdict(int) for i, pair in enumerate(random_pairs): src, dst = pair # best path best_path_length = nx.shortest_path_length(true_graph, source=src, target=dst) total_distribution[0] += 1 total_distribution[best_path_length] += 1 # greedy path curr_src = src path_length = 0 seen = set() while curr_src != dst: seen.add(curr_src) # find neighbor closest to destination unseen_neighbors = filter(lambda x: x not in seen, true_graph.neighbors(curr_src)) if not len(unseen_neighbors): # greedy algorithm stuck in 'leaf' path_length = np.nan break def curr_distance(v): return distance(embeddings[dst], embeddings[v]) closest_neigh = min(unseen_neighbors, key=curr_distance) path_length += 1 curr_src = closest_neigh if path_length == best_path_length: success_distribution[0] += 1 success_distribution[best_path_length] += 1 all_success = success_distribution[0] all_total = total_distribution[0] all_ratio = float(all_success) / all_total * 100 print "Total: {} / {} ({:.2f} %)".format(all_success, all_total, all_ratio) for pl in sorted(set(total_distribution.keys()) | set(success_distribution.keys())): if pl == 0: continue total = total_distribution.get(pl, 0) success = success_distribution.get(pl, 0) ratio = float(success) / total * 100 print "Path length = {}: {} / {} ({:.2f} %)".format(pl, success, total, ratio) if False: # depends on R, bad for subgraphs -- not used n = len(vertices) R = 2 * np.log(n) coshR = np.cosh(R) predicted_edges = set() print "predict edges" for v in vertices: coords = embeddings[v] neigh_idx = bt.query_radius(coords, R) neigh = [vertices[i] for i in neigh_idx[0].tolist() if vertices[i] != v] 
predicted_edges.update([make_edge(v, ne) for ne in neigh]) report.append(['total_predicted_edges', len(predicted_edges)]) # contingency matrix print "compute contingency matrix" report.append(['true positive', len(edges & predicted_edges)]) report.append(['false positive', len(predicted_edges - edges)]) report.append(['false negative', len(edges - predicted_edges)]) report.append(['true negative', n*(n-1)/2 - len(edges | predicted_edges)]) return report
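# A sketch of the degree-aware neighbour lookup used in the cda step above, with a
# plain Euclidean metric standing in for the module's custom `distance`; the vertex
# list and embeddings are illustrative.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
vertices = list(range(100))
embeddings = {v: rng.random_sample(2) for v in vertices}
embeddings_array = np.array([embeddings[v] for v in vertices])

bt = BallTree(embeddings_array, metric='euclidean')
v, degree = vertices[0], 5
# query expects a 2D array; ask for degree + 1 because the vertex itself comes back too.
dist, ind = bt.query(embeddings_array[v:v + 1], k=degree + 1)
neighbours = [vertices[i] for i in ind[0] if vertices[i] != v]
print(neighbours)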
r_vir = 0.184*(mass/1e12)**.33 * (omega_z/0.28)**0.33  # r_virial of halos from the NFW formulation
candidates = (mass > 2e11).nonzero()[0]  # cut at the minimum halo mass considered in the CS model, here m_{h,0}=2e11 M_sun
rand_ids = random.sample(candidates, 100000)  # sample a fraction of the candidates, for example choosing 100,000 halos

# making data tree
data = np.array([x, y, z]).T
data_tree = BallTree(data)

# searching the data tree, finding halos in the CS model with DR_0 = 1.
DR0 = 1.
good_dr = []
for n in np.arange(0, len(rand_ids)):
    halo = rand_ids[n]
    # searching within 10 Mpc of the halo; query_radius expects a 2D array of query points
    halo_neighbors = data_tree.query_radius(data[halo:halo+1, :], 10.,
                                            count_only=False, return_distance=False)
    concat_halos = np.concatenate(halo_neighbors)
    pruned_halos = (mass[concat_halos] > .5*mass[halo]).nonzero()[0]
    SB = concat_halos[pruned_halos]
    if len(SB) < 2:  # if no other halo is found within 10 Mpc, analyze the next candidate
        continue
    d = np.ones(len(SB))*1000
    i = 0
    for sample in SB:
        d[i] = ((x[sample]-x[halo])**2 + (y[sample]-y[halo])**2 + (z[sample]-z[halo])**2)**0.5
        d[i] /= r_vir[sample]
        i += 1
    DR = min(d[d > 0])
    exc = ((d < 1) & (d > 0)).nonzero()[0]
    MM = mass[SB[exc]]
    MM_ind = (MM > 1e13).nonzero()[0]
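# A vectorized variant of the neighbour-distance loop above, using the distances that
# query_radius can return directly; `data`, `mass` and `r_vir` here are illustrative
# mock values (the omega_z factor is dropped), not the original simulation catalog.
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
data = rng.random_sample((10000, 3)) * 100.0        # positions in Mpc
mass = 10 ** rng.uniform(11, 14, 10000)             # halo masses in M_sun
r_vir = 0.184 * (mass / 1e12) ** 0.33               # as in the script above, without the omega_z term

tree = BallTree(data)
halo = 0
ind, dist = tree.query_radius(data[halo:halo + 1], r=10.0, return_distance=True)
ind, dist = ind[0], dist[0]

# Keep neighbours at least half as massive as the halo, excluding the halo itself.
keep = (mass[ind] > 0.5 * mass[halo]) & (dist > 0)
scaled = dist[keep] / r_vir[ind[keep]]              # separations in units of each neighbour's r_vir
DR = scaled.min() if scaled.size else np.nan
print(DR)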