Example #1
def get_nearest(src_points, candidates, k_neighbors=1):
    """Find nearest neighbors for all source points from a set of candidate points"""

    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=15, metric='haversine')

    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)

    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()

    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    #    closest = indices[0]
    #    closest_dist = distances[0]
    closest = indices
    closest_dist = distances

    # Return indices and distances
    return (closest, closest_dist)
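
A minimal usage sketch (not part of the original example), assuming get_nearest above is in scope and that both point sets are (lat, lon) pairs converted to radians, as the haversine metric requires; the coordinates below are made up.

import numpy as np
from sklearn.neighbors import BallTree  # needed by get_nearest above

# Made-up (lat, lon) pairs in degrees, converted to radians for haversine
src = np.radians([[60.17, 24.94], [60.45, 22.26]])
candidates = np.radians([[60.20, 24.65], [61.50, 23.76], [60.39, 25.66]])

closest, closest_dist = get_nearest(src, candidates, k_neighbors=1)
# Haversine distances come back in radians; multiply by the Earth's
# mean radius (~6371 km) to convert to kilometres
print(closest)
print(closest_dist * 6371.0088)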
Example #2
    def get_nearest(src_points, candidates, k_neighbors=1):
        # get_nearest and nearest_neighbor functions sourced from the following site:
        # https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html
        """Find nearest neighbors for all source points from a set of candidate points"""
        print('balltree get nearest function - hsl')
        # Create tree from the candidate points
        tree = BallTree(candidates, leaf_size=15, metric='haversine')

        # Find closest points and distances
        distances, indices = tree.query(src_points, k=k_neighbors)

        # Transpose to get distances and indices into arrays
        distances = distances.transpose()
        indices = indices.transpose()

        # Get closest indices and distances (i.e. array at index 0)
        # note: for the second closest points, you would take index 1, etc.
        closest = indices[0]
        closest_dist = distances[0]

        # Return indices and distances
        return (closest, closest_dist)
Example #3
    def train(self):
        R = (self.m**2 + self.n**2)**0.5
        for it in range(self.max_iter):
            for sample in self.data:
                win_m, win_n, max_sim = self.find_winner(sample)
                neighbor = self.find_neighbor(win_m, win_n, R)
                for w in neighbor:
                    mw = w[0]
                    nw = w[1]
                    rw = w[2]
                    self.weights[mw, nw] += self.learning_rate(
                        it, rw) * (sample - self.weights[mw, nw])
                R *= 1 - (it + 1) / self.max_iter
        data_tree = BallTree(self.data)
        for mi in range(self.m):
            for ni in range(self.n):
                dist, idx = data_tree.query([self.weights[mi, ni]], k=10)
                vote = [self.labels[i] for i in idx.reshape(-1)]
                self.output[mi, ni] = int(
                    sorted(dict(Counter(vote)).items(),
                           key=lambda d: d[1],
                           reverse=True)[0][0])
Example #4
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='euclidean',
                        **kwargs):

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric, **kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
                                                alpha)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    return single_linkage_tree
Example #5
def estimate_bayes_factor(trace, r=0.05, return_list=False):
    """Estimate the bayes factor using the local density of points"""

    # Convert traces to a numpy array, ignore the intervals
    trace_arr = np.array([trace[i] for i in trace.varnames if "_interval__" not in i])
    trace_t = trace_arr.T
    N_iter, D = trace_t.shape

    # compute volume of a D-dimensional sphere of radius r
    Vr = np.pi ** (0.5 * D) / gamma(0.5 * D + 1) * (r ** D)

    # use neighbor count within r as a density estimator
    bt = BallTree(trace_t)
    count = bt.query_radius(trace_t, r=r, count_only=True)

    BF = trace.model_logp + np.log(N_iter) + np.log(Vr) - np.log(count)

    if return_list:
        return BF
    else:
        p25, p50, p75 = np.percentile(BF, [25, 50, 75])
        return p50, 0.7413 * (p75 - p25)
Example #6
def make_nearest_surf(center,
                      radius,
                      rotation,
                      contour_pts,
                      psize=20,
                      qsize=8,
                      vis=False,
                      seg=None):

    points = np.array([[
        radius[0] * math.cos(u) * math.cos(v),
        radius[1] * math.cos(v) * math.sin(u), radius[2] * math.sin(v)
    ] for u in np.linspace(0, 2 * math.pi, num=psize) for v in np.linspace(
        -math.pi / 2 + 0.01, math.pi / 2 - 0.01, num=psize)])
    for i in range(len(points)):
        points[i] = np.dot(points[i], rotation)
    points += center

    tree = BallTree(contour_pts)
    _, ind = tree.query(points, k=1)
    ind = np.reshape(ind, (ind.shape[0]))

    points = contour_pts[ind, :].astype(np.float64)
    noise = 0.001
    points += np.random.rand(points.shape[0], points.shape[1]) * noise

    if vis:
        img_mask = get_image_mask_points(seg, points)

        color_img = draw_segmentation(seg, img_mask, mark_val=255)
        show_ct_image(color_img)

    return approximate_surface(points.tolist(),
                               psize,
                               psize,
                               3,
                               3,
                               ctrlpts_size_u=qsize,
                               ctrlpts_size_v=qsize)
Example #7
def calc_nearest_site():
    # Use sklearn's BallTree to find the nearest port to each center.
    ports_wpi = get_sites(engine)
    points_of_int = np.radians(
        df_centers.loc[:, ['average_lat', 'average_lon']].values)
    candidates = np.radians(ports_wpi.loc[:, ['lat', 'lon']].values)
    tree = BallTree(candidates, leaf_size=30, metric='haversine')
    nearest_list = []
    for i in range(len(points_of_int)):
        dist, ind = tree.query(points_of_int[i, :].reshape(1, -1), k=1)
        nearest_dict = {
            clust_id_value: df_centers.iloc[i].loc[clust_id_value],
            'nearest_site_id': ports_wpi.iloc[ind[0][0]].loc['port_id'],
            'nearest_port_dist': dist[0][0] * 6371.0088
        }
        nearest_list.append(nearest_dict)
    df_nearest = pd.DataFrame(nearest_list)
    # Return the merged frame rather than rebinding the module-level
    # df_centers inside the function, which would make the earlier
    # df_centers reads unbound locals.
    return pd.merge(df_centers, df_nearest, how='left', on=clust_id_value)
Example #8
def test_index():
    xs = rand(1000, 100, random_state=42).toarray()

    try:
        indexer = SQLiteIndexer(index_path=INDEX_PATH)
        index = PrioritizedDynamicContinuousIndex(indexer,
                                                  composite_indices=2,
                                                  simple_indices=50)
        index.fit(xs)

        x = xs[0:1]
        k = 10

        nn_baseline = BallTree(xs)

        baseline_dist, baseline_idx = nn_baseline.query(x, k=k)
        dist, idx = index.query(x, k=k)

        # np.testing.assert_equal(baseline_idx[0], idx)
    finally:
        if os.path.exists(INDEX_PATH):
            os.remove(INDEX_PATH)
Example #9
def nne(dim_red, true_labels):
    """
    Calculates the nearest neighbor accuracy (basically leave-one-out cross
    validation with a 1NN classifier).

    Args:
        dim_red (array): dimensions (k, cells)
        true_labels (array): 1d array of integers

    Returns:
        Nearest neighbor accuracy - fraction of points for which the 1NN
        classifier returns the correct value.
    """
    # use sklearn's BallTree
    bt = BallTree(dim_red.T)
    correct = 0
    for i, l in enumerate(true_labels):
        dist, ind = bt.query([dim_red[:, i]], k=2)
        closest_cell = ind[0, 1]
        if true_labels[closest_cell] == l:
            correct += 1
    return float(correct) / len(true_labels)
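
A quick sanity check (not from the original source): calling nne on a tiny made-up embedding with two well-separated clusters, where the 1NN accuracy should come out as 1.0.

import numpy as np

# Hypothetical 2-D embedding of 6 cells (shape: dimensions x cells)
dim_red = np.array([[0.0, 0.1, 0.2, 5.0, 5.1, 5.2],
                    [0.0, 0.1, 0.2, 5.0, 5.1, 5.2]])
true_labels = np.array([0, 0, 0, 1, 1, 1])
print(nne(dim_red, true_labels))  # 1.0: every cell's nearest neighbour shares its label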
Example #10
def get_score_for_ideal_points(c, ideal_points, IDEAL_RADIUS, IDEAL_HEIGHT):
    #rename cameras
    rename_cameras(c)

    #get normalized points of cameras currently aligned
    points = get_normalized_points(c, IDEAL_RADIUS)

    #get translation and rotation vector

    #get model, scene and after non rigid points
    model, scene, after_tps = cca.non_rigid_registration(points, ideal_points)

    #save_points_like_obj(model, "D:/model{}.obj".format(counter))
    #save_points_like_obj(scene, "D:/scene{}.obj".format(counter))
    #save_points_like_obj(after_tps, "D:/after_tps{}.obj".format(counter))

    distances_array = []

    ballTree = BallTree(after_tps)
    #for dooblicator v1 46 min distance between cameras is height/2
    if len(c.cameras) >= 41 and len(c.cameras) <= 51:
        radius = 2 * (IDEAL_HEIGHT / 2) / 3
    else:
        radius = 2 * IDEAL_HEIGHT / 3
    not_functional = []
    i = 0
    for point in ideal_points:
        ind = ballTree.query_radius([point], radius)  # query_radius expects a 2D array of query points
        if len(ind[0]) == 1:
            distances_array.append(np.linalg.norm(point -
                                                  after_tps[ind[0][0]]))
        else:
            i += 1
            distances_array.append(1000)

    print("SCORE: ", np.mean(distances_array))

    return np.mean(distances_array)
Example #11
def image_retrieval():
    topK = 10
    avg_acc = 0

    x_train_noisy, x_test_noisy, y_train, y_test, x_train, x_test = preprocess(
    )
    autoencoder = load_model('../working/autoencoder.h5')
    print(autoencoder.summary())
    encoder = Model(autoencoder.input,
                    autoencoder.get_layer('encoding_layer').output)

    coded_train = encoder.predict(x_train_noisy)
    coded_train = coded_train.reshape(
        coded_train.shape[0],
        coded_train.shape[1] * coded_train.shape[2] * coded_train.shape[3])
    coded_train = preprocessing.normalize(coded_train, norm='l2')

    tree = BallTree(coded_train, leaf_size=200)

    #extracting features from test set
    coded_test = encoder.predict(x_test_noisy)
    coded_test = coded_test.reshape(
        coded_test.shape[0],
        coded_test.shape[1] * coded_test.shape[2] * coded_test.shape[3])
    coded_test = preprocessing.normalize(coded_test, norm='l2')

    for i in range(coded_test.shape[0]):
        query_code = coded_test[i]
        query_label = y_test[i]
        dists, ids = tree.query([query_code], k=topK)
        labels = np.array([y_train[id] for id in ids[0]])

        acc = (labels == query_label).astype(int).sum() / topK
        avg_acc += acc
        if i % 1000 == 0:
            print('{} / {}: {}'.format(i, coded_test.shape[0], acc))
    avg_acc /= coded_test.shape[0]
    print("The average top K accuracy is: {}".format(avg_acc))
Example #12
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        distances = random_state.randn(n_samples, n_features)
        distances = distances.astype(np.float32)
        distances = abs(distances.dot(distances.T))
        np.fill_diagonal(distances, 0.0)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, verbose=0)
        kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
                                              n_samples, n_components)

        k = n_samples - 1
        bt = BallTree(distances)
        distances_nn, neighbors_nn = bt.query(distances, k=k + 1)
        neighbors_nn = neighbors_nn[:, 1:]
        distances_nn = np.array([distances[i, neighbors_nn[i]]
                                 for i in range(n_samples)])
        assert np.all(distances[0, neighbors_nn[0]] == distances_nn[0]),\
            abs(distances[0, neighbors_nn[0]] - distances_nn[0])
        P_bh = _joint_probabilities_nn(distances_nn, neighbors_nn,
                                       perplexity, verbose=0)
        kl_bh, grad_bh = _kl_divergence_bh(params, P_bh, degrees_of_freedom,
                                           n_samples, n_components,
                                           angle=angle, skip_num_points=0,
                                           verbose=0)

        P = squareform(P)
        P_bh = P_bh.toarray()
        assert_array_almost_equal(P_bh, P, decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)
Example #13
def put_zalando_stuff_in_db():
	"""Puts the downloaded Zalando stuff into the database etc"""

	feat_extr = FeatureExtr()

	pic_list = []
	X = []

	for file in os.listdir(TRAIN_IMG_PATH):
		if os.path.isdir(TRAIN_IMG_PATH + file):
			with open(TRAIN_IMG_PATH + file + "/data.txt") as data_file:
				prod_data = json.load(data_file)

			prd = Product(name=prod_data['name'], brand=prod_data['brand']["name"], external_id=prod_data['id'],
			              price=prod_data["units"][0]["price"]["value"], display_img_path="")

			prd.save()

			for img_file in prod_data['media']['images']:
				img_name = img_file['mediumUrl'].split('/')[-1]
				img_path = TRAIN_IMG_PATH + file + "/" + img_name

				img_orig = (imread(img_path)[:, :, :3]).astype(np.float32)
				img_resize = imresize(img_orig, (227, 227))

				img_feat = feat_extr.get_features([img_resize])[0]
				norm_img_feat = img_feat / np.linalg.norm(img_feat)

				prd.picture_set.create(img_type=img_file['type'], img_path=img_path, feature_array=norm_img_feat)

				pic_list.append((prd.external_id, norm_img_feat, img_path))
				X.append(norm_img_feat)


	kdt = BallTree(X, leaf_size=30, metric='euclidean')

	pickle.dump(pic_list, open(NEAREST_NEIGH_PATH + "pic_list.p", "wb"))
	pickle.dump(kdt, open(NEAREST_NEIGH_PATH + "tree.p", "wb"))
Example #14
def _hdbscan_boruvka_balltree(X,
                              min_samples=5,
                              alpha=1.0,
                              metric='minkowski',
                              p=2,
                              leaf_size=40,
                              approx_min_span_tree=True,
                              gen_min_span_tree=False,
                              core_dist_n_jobs=4,
                              **kwargs):
    if leaf_size < 3:
        leaf_size = 3

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = BallTreeBoruvkaAlgorithm(tree,
                                   min_samples,
                                   metric=metric,
                                   leaf_size=leaf_size // 3,
                                   approx_min_span_tree=approx_min_span_tree,
                                   n_jobs=core_dist_n_jobs,
                                   **kwargs)
    min_spanning_tree = alg.spanning_tree()
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[
        np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None
Example #15
    def __init__(self,
                 pointings,
                 raCol='_ra',
                 decCol='_dec',
                 indexCol='obsHistID',
                 leafSize=50):
        """
        Create a tree of pointings

        Parameters
        ----------
        pointings : `pd.DataFrame`
            of pointings with unique index values as the index column
        raCol :  string
            column name for a column holding ra values in radians
        decCol :  string
            column name for a column holding dec values in radians

        .. note : raCol and decCol are assumed to hold ra and dec in units of
        radians
        """
        self.pointings = pointings

        if self.validatePointings(pointings, raCol, decCol):
            self.raCol = raCol
            self.decCol = decCol
        else:
            raise ValueError('pointings, and the provided values of raCol, decCol {0}, {1} are incompatible'.format(raCol, decCol))

        # tree queries
        # Keep mapping from integer indices to obsHistID
        pointings.loc[:, 'intindex'] = np.arange(len(pointings)).astype(int)
        self.indMapping = pointings['intindex'].reset_index().set_index('intindex')

        # Build Tree
        self.tree = BallTree(pointings[[decCol, raCol]].values,
                             leaf_size=leafSize,
                             metric='haversine')
Example #16
    def nn_search(self,
                  tree_features,
                  query_features,
                  metric='haversine',
                  convert_radians=False):
        '''
        Build a BallTree for nearest neighbor search based on haversine distance.

        Parameters
        ----------

        tree_features: array_like
                       Input features to create the search tree. Features are in
                       lat, lon format, in radians

        query_features: array_like
                        Points to which calculate the nearest neighbor within the tree.
                        latlon coordinates expected in radians for distance calculation

        metric: str
                Distance metric for neighborhood search. Default haversine for latlon coordinates.

        convert_radians: bool
                         Flag in case features are not in radians and need to be converted

        Returns
        -------

        distances: array_like
                   Array with the corresponding distance in km (haversine distance * earth radius)

        '''

        if convert_radians:
            # Features assumed to be given in degrees; haversine needs radians
            tree_features = np.radians(tree_features)
            query_features = np.radians(query_features)

        tree = BallTree(tree_features, metric=metric)
        return tree.query(query_features)[0] * 6371000 / 1000
Example #17
def distance_to_port(lon, lat, ports):
    '''
    Take longitude and latitude and return the distance (km) to the
    closest port, using the World Port Index database. This uses a ball
    tree search approach in radians, accounting for the curvature of
    the Earth by calculating
    radians, accounting for the curvature of the Earth by calculating
    the Haversine metric for each pair of points. Note that Haversine
    distance metric expects coordinate pairs in (lat, long) order,
    in radians.
    Arguments:
    lon, lat: Arrays of longitude-latitude pairs of ship locations, in degrees
    ports: shape file of ports
    Returns:
    Pandas Series 'distance_to_port' with the distance (km) to the closest port
    '''
    ports_flip = np.flip(ports, axis=1)
    coords = pd.concat([np.radians(lat), np.radians(lon)], axis=1)
    tree = BallTree(np.radians(ports_flip), metric='haversine')
    dist, ind = tree.query(coords, k=1)
    df_distance_to_port = pd.Series(
        dist.flatten() * 6371,  # radius of earth (km)
        name='distance_to_port')
    return df_distance_to_port
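
A hedged usage sketch with invented data (not from the original repo): lon and lat are passed as pandas Series in degrees, and ports is a small (lon, lat) array standing in for the World Port Index table.

import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree  # used inside distance_to_port

lon = pd.Series([24.90, 2.35], name='lon')   # ship longitudes, degrees
lat = pd.Series([60.20, 48.85], name='lat')  # ship latitudes, degrees
ports = np.array([[24.95, 60.17],            # (lon, lat): Helsinki
                  [-0.12, 51.50]])           # (lon, lat): London
print(distance_to_port(lon, lat, ports))     # distance in km to the nearest port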
Example #18
    def __init__(
            self,
            k,
            train_X,
            train_Y,
            n_components=5,
            weights=[1, 1],  # must be above 1
            threshold=0.9):

        self.train_X = np.asarray(train_X)
        # Use BallTree to optimize neighbour searches
        # Is O(m log n) instead of O(m n)
        self.tree = BallTree(self.train_X)

        self.train_Y = np.asarray(train_Y).astype(int)
        self.k = k
        self.weights = weights

        print("scoring training data")
        self.train_scores = self.outlier_score(self.train_X)

        print("thresholding training scores")
        self.threshold, self.p = gamma_threshold(self.train_scores, threshold)
Example #19
    def fit(self, X):
        centroids_dict = defaultdict(int)
        seeds = self.init_seeds(X)
        ball_tree = BallTree(X)
        for weighted_mean in seeds:
            for i in range(self.max_iterations):
                prev_weighted_mean = weighted_mean
                points_within = X[ball_tree.query_radius([prev_weighted_mean],
                                                         self.bandwidth)[0]]
                weighted_mean = self.update_kernel_fn(prev_weighted_mean,
                                                      points_within,
                                                      self.bandwidth)

                if (np.linalg.norm(weighted_mean - prev_weighted_mean) <
                        self.tol * self.bandwidth):
                    break

            centroids_dict[tuple(weighted_mean)] = len(points_within)

        self.centroids_ = self._remove_overlapping_windows(centroids_dict)
        self.labels_ = np.array([self._closest_centroid(x) for x in X])

        return self
Example #20
def associate(rad_1, rad_2, k_nn=1):
    """
    Given two grids rad_1 and rad_2, this associates each point in rad_2 to the k-nearest neighbours in
    rad_1.
    Pairs of the form [latitude, longitude]
    """

    # Room for improvement:
    # - Run the Ball tree on the smallest net
    # - Use something more efficient than a Ball Tree, like a binary search.

    # Build Ball Tree
    Ball = BallTree(rad_1, metric='haversine')

    # Searching Data
    distances, indices = Ball.query(rad_2,
                                    k=k_nn,
                                    breadth_first=True,
                                    return_distance=True)

    assert rad_2.shape[0] == indices.shape[0]

    return distances, indices
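
A small illustration with made-up grids (not part of the original): both grids are (lat, lon) pairs in radians, and the returned haversine distances can be scaled by the Earth's radius to obtain kilometres.

import numpy as np
from sklearn.neighbors import BallTree  # used inside associate

rad_1 = np.radians([[60.2, 24.9], [59.3, 18.1], [55.7, 12.6]])  # reference grid
rad_2 = np.radians([[60.1, 25.0], [55.6, 12.5]])                # points to associate
distances, indices = associate(rad_1, rad_2, k_nn=1)
print(indices.ravel())        # index into rad_1 of the nearest grid point
print(distances * 6371.0088)  # great-circle distances in km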
Example #21
    def _get_unassigned_balltree(self):
        """
        Use BallTree to find nearest clusters
        """
        k = self.clusters.k

        if k == 1:
            return super(FastDPMeans, self).get_unassigned()

        tree = BallTree(self.centers, leaf_size=k + 1)

        neigh, _ = tree.query_radius(self.data,
                                     self.cutoff,
                                     sort_results=True,
                                     return_distance=True)

        n_neigh = np.array(list(map(len, neigh)))
        assigned = np.nonzero(n_neigh > 0)[0]
        unassigned = np.nonzero(n_neigh == 0)[0]

        self.clusters.labels[assigned] = [neigh[i][0] for i in assigned]

        return unassigned
Example #22
	def particle_position(self, position=None, leaf_size=None, metric=None, position_min=None, position_max=None):
		if leaf_size is not None: self.leaf_size = leaf_size
		if metric is not None: self.metric = metric
		if position is None: 
			print('Input particle positions using particle_position().')
			return None
		else: 
			self.position = position

		if isinstance(self.position, (pd.core.frame.DataFrame)):
			X = np.vstack((self.position.x, self.position.y, self.position.z)).T
		else:
			X = self.position

		position_min = X.min() if position_min is None else position_min
		position_max = X.max() if position_max is None else position_max

		X = (X-position_min)/(position_max-position_min)

		print('Building tree...')
		# self.tree = KDTree(X, leaf_size=self.leaf_size, metric=self.metric)
		self.tree = BallTree(X, leaf_size=self.leaf_size, metric=self.metric)
		print('Tree built with the positions.')
Example #23
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = BallTree(X, metric=metric)

    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:, -1]
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    return single_linkage_tree
Example #24
    def _remove_overlapping_windows(self, centroids_dict):
        '''
        Removes overlapping windows
        :param centroids_dict: Dictionary mapping window positions to the number of points each window contains.
        :return: Filtered windows.
        '''
        centroids_by_intensity = sorted(centroids_dict.items(),
                                        key=lambda tup: tup[1],
                                        reverse=True)
        centroids = np.array([
            centroid for centroid, size in centroids_by_intensity
            if size >= self.min_bin_size
        ])
        unique = np.ones(len(centroids), dtype=bool)
        nbrs = BallTree(centroids)

        for centroid_ind, centroid in enumerate(centroids):
            if (unique[centroid_ind]):
                indexes = nbrs.query_radius([centroid], self.bandwidth)[0]
                unique[indexes] = False
                unique[centroid_ind] = True

        return centroids[unique]
Example #25
def find_partial_connected_components(data, cutoff=15.0):

    import networkx as nx
    import numpy as np
    from sklearn.neighbors import BallTree

    tree = BallTree(data, leaf_size=40)
    edges = tree.query_radius(data, cutoff)
    edge_list = [list(zip(np.repeat(idx, len(dest_list)), dest_list))
                 for idx, dest_list in enumerate(edges)]

    edge_list_flat = np.array([list(item)
                               for sublist in edge_list for item in sublist])
    res = edge_list_flat
    res_tree = edge_list_flat[edge_list_flat[:, 0] < edge_list_flat[:, 1], :]

    graph = nx.from_edgelist(res_tree)

    # partial connected components

    connected_components = nx.connected_components(graph)
    for x in connected_components:
        yield [x]
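
A hedged example of consuming the generator, using invented points: two groups further apart than the cutoff end up in separate components (networkx must be installed).

import numpy as np

data = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 0.5],      # first group
                 [100.0, 100.0], [101.0, 101.0]])         # second group, > cutoff away
for component in find_partial_connected_components(data, cutoff=15.0):
    print(component)  # e.g. [{0, 1, 2}] then [{3, 4}]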
Example #26
def _rsl_boruvka_balltree(X,
                          k=5,
                          alpha=1.0,
                          metric='euclidean',
                          leaf_size=40,
                          **kwargs):

    dim = X.shape[0]
    min_samples = min(dim - 1, k)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = BallTreeBoruvkaAlgorithm(tree,
                                   min_samples,
                                   metric=metric,
                                   alpha=alpha,
                                   leaf_size=leaf_size,
                                   **kwargs)
    min_spanning_tree = alg.spanning_tree()

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    return single_linkage_tree
Example #27
def get_nearest(src_points, candidates, k_neighbors=2):
    """
    Converts lat-long coords to great-circle distance and
    returns the two closest.
    """

    # Create tree from the candidate points
    tree = BallTree(candidates, leaf_size=20, metric='haversine')

    # Find closest points and distances
    distances, indices = tree.query(src_points, k=k_neighbors)

    # Transpose to get distances and indices into arrays
    distances = distances.transpose()
    indices = indices.transpose()

    # Get closest indices and distances (i.e. array at index 0)
    # note: for the second closest points, you would take index 1, etc.
    closest = indices[0:2]
    closest_dist = distances[0:2]

    return (closest, closest_dist)
Example #28
def find_hits_for_targets(
    *,
    targets: List[Tuple[float, ...]],
    predictions: List[Tuple[float, ...]],
    radius: float,
) -> List[Tuple[int, ...]]:
    """
    Generates a list of the predicted points that are within a radius r of the
    targets. The indices are returned in sorted order, from closest to
    farthest point.

    Parameters
    ----------
    targets
        A list of target points
    predictions
        A list of predicted points
    radius
        The maximum distance that two points can be apart for them to be
        considered a hit

    Returns
    -------

    A list which has the same length as the targets list. Each element within
    this list contains another list that contains the indices of the
    predictions that are considered hits.

    """
    predictions_tree = BallTree(predictions)
    hits, _ = predictions_tree.query_radius(
        X=targets,
        r=radius,
        return_distance=True,
        sort_results=True,
    )
    return hits
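
A quick sketch with made-up 2-D points, assuming find_hits_for_targets and its sklearn/typing imports are in scope; the second target has no prediction within the radius, so its hit list is empty.

targets = [(0.0, 0.0), (10.0, 10.0)]
predictions = [(0.1, 0.0), (0.2, 0.1), (9.0, 9.0)]
hits = find_hits_for_targets(targets=targets, predictions=predictions, radius=1.0)
print(hits)  # e.g. [array([0, 1]) array([], dtype=int64)]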
Example #29
def _hdbscan_boruvka_balltree(X,
                              min_samples=5,
                              alpha=1.0,
                              metric='minkowski',
                              p=2,
                              leaf_size=40,
                              approx_min_span_tree=True,
                              gen_min_span_tree=False,
                              core_dist_n_jobs=4,
                              **kwargs):

    if leaf_size < 3:
        leaf_size = 3

    if core_dist_n_jobs < 1:
        raise ValueError(
            'Parallel core distance computation requires 1 or more jobs!')

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = BallTreeBoruvkaAlgorithm(tree,
                                   min_samples,
                                   metric=metric,
                                   leaf_size=leaf_size // 3,
                                   approx_min_span_tree=approx_min_span_tree,
                                   n_jobs=core_dist_n_jobs,
                                   **kwargs)
    min_spanning_tree = alg.spanning_tree()
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[
        np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None
Example #30
def index_nn_haversine(centroids, coordinates, threshold=THRESHOLD):
    """Compute the nearest centroid for each coordinate using a Ball
    tree with haversine distance.

    Parameters:
        centroids (2d array): First column contains latitude, second
            column contains longitude. Each row is a geographic point
        coordinates (2d array): First column contains latitude, second
            column contains longitude. Each row is a geographic point
        threshold (float): distance threshold in km over which no neighbor will
            be found. Those are assigned with a -1 index

    Returns:
        array with as many rows as coordinates, containing the centroid
            indexes
    """
    # Construct tree from centroids
    tree = BallTree(np.radians(centroids), metric='haversine')
    # Select unique exposures coordinates
    _, idx, inv = np.unique(coordinates, axis=0, return_index=True,
                            return_inverse=True)

    # query the k closest points of the n_points using dual tree
    dist, assigned = tree.query(np.radians(coordinates[idx]), k=1,
                                return_distance=True, dualtree=True,
                                breadth_first=False)

    # Raise a warning if the minimum distance is greater than the
    # threshold and set an invalid index -1
    num_warn = np.sum(dist * EARTH_RADIUS_KM > threshold)
    if num_warn:
        LOGGER.warning('Distance to closest centroid is greater than %s '
                       'km for %s coordinates.', threshold, num_warn)
        assigned[dist * EARTH_RADIUS_KM > threshold] = -1

    # Copy result to all exposures and return value
    return np.squeeze(assigned[inv])
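
A usage sketch under stated assumptions: EARTH_RADIUS_KM, THRESHOLD and LOGGER are module-level names the function relies on (values guessed here), THRESHOLD must already exist when the function is defined because it appears as a default argument, and coordinates are (lat, lon) in degrees.

import logging
import numpy as np

# Assumed module-level globals (THRESHOLD must be defined before the
# function definition above is executed)
EARTH_RADIUS_KM = 6371.0088
THRESHOLD = 100
LOGGER = logging.getLogger(__name__)

centroids = np.array([[60.17, 24.94], [51.50, -0.12]])   # (lat, lon), degrees
coordinates = np.array([[60.20, 24.90], [48.85, 2.35]])  # second point is > 100 km from both
print(index_nn_haversine(centroids, coordinates))        # e.g. [ 0 -1]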