Exemple #1
0
def nn_descent(
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
    seed_per_row=False,
):
    """Run NN-descent over data given as ``inds`` / ``indptr`` / ``data`` arrays.

    A new heap is created, seeded (optionally from ``leaf_array`` and then
    always by ``init_random``), handed to one of the two internal descent
    routines, and finally passed through ``deheap_sort``.

    Parameters
    ----------
    inds, indptr, data:
        The three arrays describing the input rows; the row count is taken
        as ``indptr.shape[0] - 1``.
    n_neighbors:
        Heap width requested from ``make_heap``.
    rng_state:
        Random state forwarded to the seeding and descent routines.
    max_candidates, dist, n_iters, delta, verbose, seed_per_row:
        Forwarded unchanged to the selected internal descent routine
        (``dist`` is also used by the seeding helpers).
    rp_tree_init:
        When true, ``init_rp_tree`` is run with ``leaf_array`` before the
        random seeding.
    leaf_array:
        Leaf data handed to ``init_rp_tree``.
    low_memory:
        Selects the low-memory internal routine instead of the high-memory
        one.

    Returns
    -------
    Whatever ``deheap_sort`` returns for the final heap.
    """
    row_count = indptr.shape[0] - 1
    knn_heap = make_heap(row_count, n_neighbors)

    # Tree-based seeding is optional; random seeding always runs afterwards.
    if rp_tree_init:
        init_rp_tree(inds, indptr, data, dist, knn_heap, leaf_array)
    init_random(n_neighbors, inds, indptr, data, knn_heap, dist, rng_state)

    # The return value of the internal routine is ignored; the heap object
    # passed in is what gets sorted below.
    if not low_memory:
        nn_descent_internal_high_memory_parallel(
            knn_heap,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )
    else:
        nn_descent_internal_low_memory_parallel(
            knn_heap,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )

    return deheap_sort(knn_heap)
Exemple #2
0
    def query(self, query_data, k=10, queue_size=5.0):
        """Query the training data for the k nearest neighbors

        Parameters
        ----------
        query_data: array-like, last dimension self.dim
            An array of points to query

        k: integer (default = 10)
            The number of nearest neighbors to return

        queue_size: float (default 5.0)
            The multiplier of the internal search queue. This controls the
            speed/accuracy tradeoff. Low values will search faster but with
            more approximate results. High values will search more
            accurately, but will require more computation to do so. Values
            should generally be in the range 1.0 to 10.0. The internal
            queue never holds fewer than ``k`` entries, so values below
            1.0 behave like 1.0.

        Returns
        -------
        indices, distances: array (n_query_points, k), array (n_query_points, k)
            The first array, ``indices``, provides the indices of the data
            points in the training set that are the nearest neighbors of
            each query point. Thus ``indices[i, j]`` is the index into the
            training data of the jth nearest neighbor of the ith query points.

            Similarly ``distances`` provides the distances to the neighbors
            of the query points such that ``distances[i, j]`` is the distance
            from the ith query point to its jth nearest neighbor in the
            training data.
        """
        query_data = np.asarray(query_data).astype(np.float32)
        self._init_search_graph()

        # The result is sliced to the first k columns below, so a queue
        # smaller than k (queue_size < 1.0) would silently return fewer
        # neighbors than documented. Clamp the queue to at least k.
        heap_size = max(k, int(k * queue_size))

        init = initialise_search(
            self._rp_forest,
            self._raw_data,
            query_data,
            heap_size,
            self._random_init,
            self._tree_init,
            self.rng_state,
        )
        result = self._search(
            self._raw_data,
            self._search_graph.indptr,
            self._search_graph.indices,
            init,
            query_data,
        )

        indices, dists = deheap_sort(result)
        return indices[:, :k], dists[:, :k]
def find_component_connection_edge(
    component1,
    component2,
    search_closure,
    raw_data,
    visited,
    rng_state,
    search_size=10,
    epsilon=0.0,
):
    """Search for a short edge between two components by alternating queries.

    Points are sampled from each component, then the function repeatedly
    queries from one side against candidates on the other side, swapping
    sides each round, and remembers the smallest distance seen.

    Parameters
    ----------
    component1, component2:
        Arrays of point indices, one per component; indexed with the result
        of ``rejection_sample``.
    search_closure:
        Callable invoked as ``search_closure(query_points,
        candidate_indices, search_size, epsilon, visited)``; it must return
        three values, of which the first two are passed to ``deheap_sort``
        and the third is ignored.
    raw_data:
        Array indexed by point indices to obtain the query points.
    visited:
        Passed through to ``search_closure`` unchanged.
    rng_state:
        Random state forwarded to ``rejection_sample``.
    search_size:
        Number of samples requested per component, and the size argument
        given to ``search_closure``.
    epsilon:
        Passed through to ``search_closure`` unchanged.

    Returns
    -------
    (endpoint, endpoint, distance)
        The two endpoints of the best edge found and its distance.
        NOTE(review): the first endpoint is taken from whichever side was
        the query side when the best distance was found, so it is not
        guaranteed to belong to ``component1`` - confirm callers do not
        rely on the order.
    """
    # Placeholder arrays; both entries are overwritten immediately below.
    # Presumably this fixes the list's element type up front - TODO confirm.
    indices = [np.zeros(1, dtype=np.int64) for i in range(2)]
    indices[0] = component1[rejection_sample(np.int64(search_size),
                                             component1.shape[0], rng_state)]
    indices[1] = component2[rejection_sample(np.int64(search_size),
                                             component2.shape[0], rng_state)]
    # Start by querying from side 0 against candidates on side 1.
    query_side = 0
    query_points = raw_data[indices[query_side]]
    candidate_indices = indices[1 - query_side].copy()
    # One "still changing" flag per side; the loop runs until both are False.
    changed = [True, True]
    best_dist = np.inf
    best_edge = (indices[0][0], indices[1][0])

    while changed[0] or changed[1]:
        inds, dists, _ = search_closure(query_points, candidate_indices,
                                        search_size, epsilon, visited)
        inds, dists = deheap_sort(inds, dists)
        # Scan every returned (query point, neighbor) pair for a new minimum.
        for i in range(dists.shape[0]):
            for j in range(dists.shape[1]):
                if dists[i, j] < best_dist:
                    best_dist = dists[i, j]
                    best_edge = (indices[query_side][i], inds[i, j])
        # Next round the current query points become the candidates ...
        candidate_indices = indices[query_side]
        # ... and the distinct first-column results become the other side's
        # points.
        new_indices = np.unique(inds[:, 0])
        # The flag is only refreshed when the lengths match; when they
        # differ it keeps its previous value.
        if indices[1 - query_side].shape[0] == new_indices.shape[0]:
            changed[1 - query_side] = np.any(
                indices[1 - query_side] != new_indices)
        indices[1 - query_side] = new_indices
        query_points = raw_data[indices[1 - query_side]]
        query_side = 1 - query_side

    return best_edge[0], best_edge[1], best_dist
    def __init__(
        self,
        data,
        metric="euclidean",
        metric_kwds=None,
        n_neighbors=15,
        n_trees=None,
        leaf_size=None,
        pruning_level=0,
        tree_init=True,
        random_state=np.random,
        algorithm="standard",
        max_candidates=20,
        n_iters=None,
        delta=0.001,
        rho=0.5,
        n_jobs=None,
        seed_per_row=False,
        verbose=False,
    ):
        """Build the approximate nearest-neighbor graph for ``data``.

        Parameters
        ----------
        data: array-like or CSR sparse matrix, shape (n_samples, n_features)
            Training data. It is validated with ``check_array`` and
            converted to ``float32``.

        metric: string or callable (default "euclidean")
            Either a callable distance function or a key of
            ``dist.named_distances``.

        metric_kwds: dict or None (default None)
            Extra metric arguments. Their values are passed positionally
            to the distance function. The dict supplied by the caller is
            not modified.

        n_neighbors: int (default 15)
            Number of neighbors kept per point.

        n_trees: int or None (default None)
            Number of random projection trees. ``None`` selects
            ``5 + round(sqrt(n_samples) / 20)``; ``0`` disables tree
            initialisation.

        leaf_size: int or None (default None)
            Forwarded to ``make_forest``.

        pruning_level: int (default 0)
            Stored as ``self.prune_level``.

        tree_init: bool (default True)
            Whether to build a random projection forest.

        random_state: anything accepted by ``check_random_state``
            Source of randomness for the internal ``rng_state``.

        algorithm: "standard" or "alternative" (default "standard")
            Which construction algorithm to run.

        max_candidates: int (default 20)
            Forwarded to the descent routine.

        n_iters: int or None (default None)
            Number of descent iterations. ``None`` selects
            ``max(5, round(log2(n_samples)))``.

        delta, rho: float
            Forwarded to the descent routine.

        n_jobs: int or None (default None)
            Passed to ``threaded.effective_n_jobs_with_context``; when the
            result is not 1 the threaded implementation is used.

        seed_per_row: bool (default False)
            Forwarded to the dense descent routines.

        verbose: bool (default False)
            Print progress messages.

        Raises
        ------
        ValueError
            If the metric is neither callable nor a recognised name, if
            the metric is unsupported for sparse data, if ``algorithm`` is
            unknown or unsupported in parallel mode, or if sparse input is
            given in parallel mode.
        """
        # Validate first so the shape-derived defaults below also work for
        # plain array-likes (lists of lists), not only objects that already
        # expose ``.shape``.
        data = check_array(data, dtype=np.float32, accept_sparse="csr")
        self._raw_data = data

        if n_trees is None:
            n_trees = 5 + int(round((data.shape[0])**0.5 / 20.0))
        if n_iters is None:
            n_iters = max(5, int(round(np.log2(data.shape[0]))))

        self.n_trees = n_trees
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwds = metric_kwds
        self.leaf_size = leaf_size
        self.prune_level = pruning_level
        self.max_candidates = max_candidates
        self.n_iters = n_iters
        self.delta = delta
        self.rho = rho
        self.dim = data.shape[1]
        self.verbose = verbose

        self.tree_init = bool(tree_init and n_trees != 0)

        # Work on a private copy: the sparse path adds "n_features" below
        # and that must not leak into the dict owned by the caller.
        metric_kwds = dict(metric_kwds) if metric_kwds else {}
        self._dist_args = tuple(metric_kwds.values())

        self.random_state = check_random_state(random_state)

        if callable(metric):
            self._distance_func = metric
        elif metric in dist.named_distances:
            self._distance_func = dist.named_distances[metric]
        else:
            raise ValueError("Metric is neither callable, " +
                             "nor a recognised string")

        self._angular_trees = metric in ("cosine", "correlation", "dice",
                                         "jaccard")

        self.rng_state = self.random_state.randint(INT32_MIN, INT32_MAX,
                                                   3).astype(np.int64)

        if self.tree_init:
            if verbose:
                print(ts(), "Building RP forest with", str(n_trees), "trees")
            self._rp_forest = make_forest(
                data,
                n_neighbors,
                n_trees,
                leaf_size,
                self.rng_state,
                self._angular_trees,
            )
            leaf_array = rptree_leaf_array(self._rp_forest)
        else:
            self._rp_forest = None
            leaf_array = np.array([[-1]])

        if threaded.effective_n_jobs_with_context(n_jobs) != 1:
            if algorithm != "standard":
                raise ValueError(
                    "Algorithm {} not supported in parallel mode".format(
                        algorithm))
            if isspmatrix_csr(self._raw_data):
                raise ValueError(
                    "Sparse input is not currently supported in parallel mode")
            if verbose:
                print(ts(), "parallel NN descent for", str(n_iters),
                      "iterations")

            # Sparse input was rejected above, so only the dense threaded
            # implementation can be reached from here.
            self._is_sparse = False
            self._neighbor_graph = threaded.nn_descent(
                self._raw_data,
                self.n_neighbors,
                self.rng_state,
                self.max_candidates,
                self._distance_func,
                self._dist_args,
                self.n_iters,
                self.delta,
                self.rho,
                rp_tree_init=self.tree_init,
                leaf_array=leaf_array,
                verbose=verbose,
                n_jobs=n_jobs,
                seed_per_row=seed_per_row,
            )
        elif algorithm == "standard" or leaf_array.shape[0] == 1:
            if isspmatrix_csr(self._raw_data):

                self._is_sparse = True

                if metric in sparse.sparse_named_distances:
                    self._distance_func = sparse.sparse_named_distances[metric]
                    if metric in sparse.sparse_need_n_features:
                        metric_kwds["n_features"] = self._raw_data.shape[1]
                    self._dist_args = tuple(metric_kwds.values())
                else:
                    raise ValueError(
                        "Metric {} not supported for sparse data".format(
                            metric))

                if verbose:
                    print(ts(), "metric NN descent for", str(n_iters),
                          "iterations")

                # delta and rho are forwarded so that user-supplied values
                # are honoured for sparse data just as for dense data.
                # NOTE(review): rp_tree_init is hard-coded to False here
                # and to True in the dense call below, regardless of
                # self.tree_init - left unchanged, confirm intent.
                self._neighbor_graph = sparse_nnd.sparse_nn_descent(
                    self._raw_data.indices,
                    self._raw_data.indptr,
                    self._raw_data.data,
                    self._raw_data.shape[0],
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    sparse_dist=self._distance_func,
                    dist_args=self._dist_args,
                    n_iters=self.n_iters,
                    delta=self.delta,
                    rho=self.rho,
                    rp_tree_init=False,
                    leaf_array=leaf_array,
                    verbose=verbose,
                )

            else:

                self._is_sparse = False

                if verbose:
                    print(ts(), "NN descent for", str(n_iters), "iterations")

                self._neighbor_graph = nn_descent(
                    self._raw_data,
                    self.n_neighbors,
                    self.rng_state,
                    self.max_candidates,
                    self._distance_func,
                    self._dist_args,
                    self.n_iters,
                    self.delta,
                    self.rho,
                    rp_tree_init=True,
                    leaf_array=leaf_array,
                    verbose=verbose,
                    seed_per_row=seed_per_row,
                )
        elif algorithm == "alternative":

            self._is_sparse = False

            if verbose:
                print(ts(), "Using alternative algorithm")

            graph_heap, search_heap = initialize_heaps(
                self._raw_data,
                self.n_neighbors,
                leaf_array,
                self._distance_func,
                self._dist_args,
            )
            graph = lil_matrix((data.shape[0], data.shape[0]))
            graph.rows, graph.data = deheap_sort(graph_heap)
            graph = graph.maximum(graph.transpose())
            self._neighbor_graph = deheap_sort(
                initialized_nnd_search(
                    self._raw_data,
                    graph.indptr,
                    graph.indices,
                    search_heap,
                    self._raw_data,
                    self._distance_func,
                    self._dist_args,
                ))
        else:
            raise ValueError("Unknown algorithm selected")

        # A negative index in the neighbor array marks a slot that was
        # never filled.
        if np.any(self._neighbor_graph[0] < 0):
            warn("Failed to correctly find n_neighbors for some samples. "
                 "Results may be less than ideal. Try re-running with "
                 "different parameters.")
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
    seed_per_row=False,
):
    """Build an approximate k-neighbor graph for dense ``data`` by NN-descent.

    The heap is seeded with random neighbors (and optionally from
    ``leaf_array``), then refined for up to ``n_iters`` rounds. Every pair
    whose distance has been computed is recorded in ``tried`` so it is
    never evaluated twice.

    Parameters
    ----------
    data:
        Array of points; ``data.shape[0]`` is the number of points and
        ``data[i]`` is handed to ``dist``.
    n_neighbors:
        Heap width and number of random samples drawn per point.
    rng_state:
        Random state forwarded to ``rejection_sample`` and
        ``new_build_candidates`` (and to ``seed`` when ``seed_per_row``).
    max_candidates:
        Number of candidate slots scanned per point in each round.
    dist, dist_args:
        Distance callable, invoked as ``dist(a, b, *dist_args)``.
    n_iters:
        Maximum number of refinement rounds.
    delta:
        Early-stop threshold: the loop ends once the number of heap
        updates in a round is at most ``delta * n_neighbors * n_points``.
    rho:
        Forwarded to ``new_build_candidates``.
    rp_tree_init, leaf_array:
        When ``rp_tree_init`` is true, ``init_rp_tree`` is run with
        ``leaf_array``.
    verbose:
        Print the round counter.
    seed_per_row:
        When true, ``seed(rng_state, i)`` is called before sampling for
        row ``i``; also forwarded to ``new_build_candidates``.

    Returns
    -------
    The result of ``deheap_sort`` on the final heap.
    """
    n_vertices = data.shape[0]
    # Presumably pre-populated with a dummy pair so the set's element type
    # is known up front (e.g. for a JIT compiler) - TODO confirm.
    tried = set([(-1, -1)])

    # Random initialisation: each sampled pair is pushed in both directions.
    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        if seed_per_row:
            seed(rng_state, i)
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        init_rp_tree(data,
                     dist,
                     dist_args,
                     current_graph,
                     leaf_array,
                     tried=tried)

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             seed_per_row,
         )

        # c accumulates the values returned by the heap pushes this round.
        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                # Negative entries are skipped as unused slots.
                if p < 0:
                    continue
                # New-vs-new: k starts at j, so p is also paired with itself
                # once (unless already tried).
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

                # New-vs-old: every old candidate of i is compared with p.
                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

        # Stop early once a round produces too few updates.
        if c <= delta * n_neighbors * data.shape[0]:
            break

    return deheap_sort(current_graph)
Exemple #6
0
def nn_descent(
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    init_graph=EMPTY_GRAPH,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
):
    """Run NN-descent over ``inds`` / ``indptr`` / ``data``, optionally from a given graph.

    Parameters
    ----------
    inds, indptr, data:
        The three arrays describing the input rows; the row count is taken
        as ``indptr.shape[0] - 1``.
    n_neighbors:
        Heap width, and the expected second dimension of a supplied
        ``init_graph``.
    rng_state:
        Random state forwarded to the seeding and descent routines.
    max_candidates, dist, n_iters, delta, verbose:
        Forwarded to the selected internal descent routine (``dist`` is
        also used by the seeding helpers).
    init_graph:
        Starting heap. When ``init_graph[0]`` has exactly one row it is
        treated as "no graph given" and a fresh heap is built and seeded.
        Otherwise ``init_graph[0]`` must have shape
        ``(n_samples, n_neighbors)`` and is used directly.
    rp_tree_init, leaf_array:
        Only used when a fresh heap is built: when ``rp_tree_init`` is
        true, ``init_rp_tree`` is run with ``leaf_array``.
    low_memory:
        Selects the low-memory internal routine.

    Returns
    -------
    The result of ``deheap_sort`` on the first two components of the heap.

    Raises
    ------
    ValueError
        If ``init_graph`` is neither the empty marker nor correctly shaped.
    """
    row_count = indptr.shape[0] - 1
    seed_rows = init_graph[0].shape[0]

    if seed_rows == 1:  # EMPTY_GRAPH
        knn_heap = make_heap(row_count, n_neighbors)

        # Optional tree-based seeding, then unconditional random seeding.
        if rp_tree_init:
            init_rp_tree(inds, indptr, data, dist, knn_heap, leaf_array)
        init_random(n_neighbors, inds, indptr, data, knn_heap, dist, rng_state)
    elif seed_rows == row_count and init_graph[0].shape[1] == n_neighbors:
        knn_heap = init_graph
    else:
        raise ValueError("Invalid initial graph specified!")

    if not low_memory:
        nn_descent_internal_high_memory_parallel(
            knn_heap,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
        )
    else:
        nn_descent_internal_low_memory_parallel(
            knn_heap,
            inds,
            indptr,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            n_iters=n_iters,
            delta=delta,
            verbose=verbose,
        )

    return deheap_sort(knn_heap[0], knn_heap[1])
Exemple #7
0
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    low_memory=False,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    """Build an approximate k-neighbor graph for sparse rows by NN-descent.

    Parameters
    ----------
    inds, indptr, data:
        Row ``r`` is described by ``inds[indptr[r]:indptr[r + 1]]`` and
        ``data[indptr[r]:indptr[r + 1]]``.
    n_vertices:
        Number of rows.
    n_neighbors:
        Heap width and number of random samples drawn per row.
    rng_state:
        Random state forwarded to ``rejection_sample`` and the internal
        descent routine.
    max_candidates, n_iters, delta, rho, verbose:
        Forwarded to the internal descent routine.
    sparse_dist, dist_args:
        Distance callable, invoked as
        ``sparse_dist(inds_a, data_a, inds_b, data_b, *dist_args)``.
    low_memory:
        Selects the low-memory internal routine, which does not receive
        the set of already-evaluated pairs.
    rp_tree_init, leaf_array:
        When ``rp_tree_init`` is true, ``sparse_init_rp_tree`` is run with
        ``leaf_array``.

    Returns
    -------
    The result of ``deheap_sort`` on the final heap.
    """
    tried = set([(-1, -1)])

    knn_heap = make_heap(n_vertices, n_neighbors)
    for row in range(n_vertices):
        sampled = rejection_sample(n_neighbors, n_vertices, rng_state)

        # The source row does not change while its samples are visited.
        row_inds = inds[indptr[row]:indptr[row + 1]]
        row_data = data[indptr[row]:indptr[row + 1]]

        for pos in range(sampled.shape[0]):
            other = sampled[pos]
            other_inds = inds[indptr[other]:indptr[other + 1]]
            other_data = data[indptr[other]:indptr[other + 1]]

            d = sparse_dist(row_inds, row_data, other_inds, other_data,
                            *dist_args)

            # Record the pair in both directions.
            heap_push(knn_heap, row, d, other, 1)
            heap_push(knn_heap, other, d, row, 1)
            tried.add((row, other))
            tried.add((other, row))

    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            dist_args,
            knn_heap,
            leaf_array,
            tried=tried,
        )

    if not low_memory:
        sparse_nn_descent_internal_high_memory(
            knn_heap,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )
    else:
        sparse_nn_descent_internal_low_memory(
            knn_heap,
            inds,
            indptr,
            data,
            n_vertices,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            sparse_dist=sparse_dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
        )

    return deheap_sort(knn_heap)
Exemple #8
0
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    low_memory=False,
    verbose=False,
    seed_per_row=False,
):
    """Build an approximate k-neighbor graph for dense ``data`` by NN-descent.

    Parameters
    ----------
    data:
        Array of points; ``data.shape[0]`` is the number of points.
    n_neighbors:
        Heap width and number of random samples drawn per point.
    rng_state:
        Random state forwarded to ``rejection_sample`` and the internal
        descent routine (and to ``seed`` when ``seed_per_row``).
    max_candidates, n_iters, delta, rho, verbose:
        Forwarded to the internal descent routine.
    dist, dist_args:
        Distance callable, invoked as ``dist(a, b, *dist_args)``.
    rp_tree_init, leaf_array:
        When ``rp_tree_init`` is true, ``init_rp_tree`` is run with
        ``leaf_array``.
    low_memory:
        Selects the low-memory internal routine, which does not receive
        the set of already-evaluated pairs.
    seed_per_row:
        When true, ``seed(rng_state, i)`` is called before sampling for
        row ``i``; also forwarded to the internal routine.

    Returns
    -------
    The result of ``deheap_sort`` on the final heap.
    """
    tried = set([(-1, -1)])

    point_count = data.shape[0]
    knn_heap = make_heap(point_count, n_neighbors)
    for row in range(point_count):
        if seed_per_row:
            seed(rng_state, row)
        sampled = rejection_sample(n_neighbors, point_count, rng_state)
        for pos in range(sampled.shape[0]):
            other = sampled[pos]
            d = dist(data[row], data[other], *dist_args)

            # Record the pair in both directions.
            heap_push(knn_heap, row, d, other, 1)
            heap_push(knn_heap, other, d, row, 1)
            tried.add((row, other))
            tried.add((other, row))

    if rp_tree_init:
        init_rp_tree(data,
                     dist,
                     dist_args,
                     knn_heap,
                     leaf_array,
                     tried=tried)

    if not low_memory:
        nn_descent_internal_high_memory(
            knn_heap,
            data,
            n_neighbors,
            rng_state,
            tried,
            max_candidates=max_candidates,
            dist=dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )
    else:
        nn_descent_internal_low_memory(
            knn_heap,
            data,
            n_neighbors,
            rng_state,
            max_candidates=max_candidates,
            dist=dist,
            dist_args=dist_args,
            n_iters=n_iters,
            delta=delta,
            rho=rho,
            verbose=verbose,
            seed_per_row=seed_per_row,
        )

    return deheap_sort(knn_heap)
Exemple #9
0
def nn_descent(data,
               n_neighbors,
               rng_state,
               max_candidates=50,
               dist=dist.euclidean,
               dist_args=(),
               n_iters=10,
               delta=0.001,
               rho=0.5,
               rp_tree_init=True,
               leaf_array=None,
               verbose=False):
    """Build an approximate k-neighbor graph for dense ``data`` by NN-descent.

    The heap is seeded with random neighbors, optionally with all pairs
    sharing a row of ``leaf_array``, and then refined for up to ``n_iters``
    rounds.

    Parameters
    ----------
    data:
        Array of points; ``data.shape[0]`` is the number of points.
    n_neighbors:
        Heap width and number of random samples drawn per point.
    rng_state:
        Random state forwarded to ``rejection_sample`` and
        ``build_candidates``.
    max_candidates:
        Number of candidate slots scanned per point in each round.
    dist, dist_args:
        Distance callable, invoked as ``dist(a, b, *dist_args)``.
    n_iters:
        Maximum number of refinement rounds.
    delta:
        Early-stop threshold: the loop ends once the number of heap
        updates in a round is at most ``delta * n_neighbors * n_points``.
    rho:
        Forwarded to ``build_candidates``.
    rp_tree_init, leaf_array:
        When ``rp_tree_init`` is true, each row of the 2-D ``leaf_array``
        is read up to its first negative entry and all pairs in it are
        pushed.
    verbose:
        Not used in this function body.

    Returns
    -------
    The result of ``deheap_sort`` on the final heap.
    """
    n_vertices = data.shape[0]

    # Random initialisation: each sampled pair is pushed in both directions.
    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)

    if rp_tree_init:
        for n in range(leaf_array.shape[0]):
            # The set is recreated for every leaf row, so it only
            # de-duplicates pairs within a single row.
            tried = set([(-1, -1)])
            for i in range(leaf_array.shape[1]):
                # A negative entry ends the row.
                if leaf_array[n, i] < 0:
                    break
                for j in range(i + 1, leaf_array.shape[1]):
                    if leaf_array[n, j] < 0:
                        break
                    if (leaf_array[n, i], leaf_array[n, j]) in tried:
                        continue
                    d = dist(data[leaf_array[n, i]], data[leaf_array[n, j]],
                             *dist_args)
                    heap_push(current_graph, leaf_array[n, i], d,
                              leaf_array[n, j], 1)
                    heap_push(current_graph, leaf_array[n, j], d,
                              leaf_array[n, i], 1)
                    tried.add((leaf_array[n, i], leaf_array[n, j]))
                    tried.add((leaf_array[n, j], leaf_array[n, i]))

    for n in range(n_iters):

        (new_candidate_neighbors, old_candidate_neighbors) = build_candidates(
            current_graph, n_vertices, n_neighbors, max_candidates, rng_state,
            rho)

        # c accumulates the values returned by the heap pushes this round.
        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                # Negative entries are skipped as unused slots.
                if p < 0:
                    continue
                # New-vs-new: k starts at j, so p is also paired with itself.
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    c += heap_push(current_graph, q, d, p, 1)

                # New-vs-old: every old candidate of i is compared with p.
                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    c += heap_push(current_graph, q, d, p, 1)

        # Stop early once a round produces too few updates.
        if c <= delta * n_neighbors * data.shape[0]:
            break

    return deheap_sort(current_graph)
Exemple #10
0
    def __init__(self,
                 data,
                 metric='euclidean',
                 metric_kwds=None,
                 n_neighbors=15,
                 n_trees=8,
                 leaf_size=15,
                 pruning_level=0,
                 tree_init=True,
                 random_state=np.random,
                 algorithm='standard',
                 max_candidates=20,
                 n_iters=10,
                 delta=0.001,
                 rho=0.5):
        """Build the neighbor graph and the search structures for ``data``.

        Parameters
        ----------
        data: array-like, shape (n_samples, n_features)
            Training data. It is validated with ``check_array`` and
            converted to ``float32``.

        metric: string or callable (default 'euclidean')
            Either a callable distance function or a key of
            ``dist.named_distances``.

        metric_kwds: dict or None (default None)
            Extra metric arguments. Their values are passed positionally
            to the distance function. ``None`` means no extra arguments.

        n_neighbors: int (default 15)
            Number of neighbors kept per point.

        n_trees: int (default 8)
            Number of random projection trees; ``0`` disables tree
            initialisation.

        leaf_size: int (default 15)
            Leaf size handed to the tree builders.

        pruning_level: int (default 0)
            Forwarded to ``prune`` as ``prune_level``.

        tree_init: bool (default True)
            Whether to build random projection trees.

        random_state: anything accepted by ``check_random_state``
            Source of randomness for the internal ``rng_state``.

        algorithm: 'standard' or 'alternative' (default 'standard')
            Which construction algorithm to run.

        max_candidates, n_iters, delta, rho:
            Forwarded to ``nn_descent``.

        Raises
        ------
        ValueError
            If the metric is neither callable nor a recognised name, or
            if ``algorithm`` is unknown.
        """
        # A fresh dict per call avoids sharing one mutable default object
        # between every instance.
        if metric_kwds is None:
            metric_kwds = {}

        # Validate first so the shape lookups below also work for plain
        # array-likes.
        data = check_array(data).astype(np.float32)

        self.n_trees = n_trees
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwds = metric_kwds
        self.leaf_size = leaf_size
        self.prune_level = pruning_level
        self.max_candidates = max_candidates
        self.n_iters = n_iters
        self.delta = delta
        self.rho = rho
        self.dim = data.shape[1]

        self.tree_init = bool(tree_init and n_trees != 0)

        self._dist_args = tuple(metric_kwds.values())

        self.random_state = check_random_state(random_state)

        self._raw_data = data.copy()

        if callable(metric):
            self._distance_func = metric
        elif metric in dist.named_distances:
            self._distance_func = dist.named_distances[metric]
        else:
            # Fail here with a clear message instead of an AttributeError
            # on the missing _distance_func further down.
            raise ValueError('Metric is neither callable, '
                             'nor a recognised string')

        self._angular_trees = metric in ('cosine', 'correlation', 'dice',
                                         'jaccard')

        # Draw from the validated generator: the raw argument may be an
        # int seed or None, neither of which has a randint method.
        self.rng_state = \
            self.random_state.randint(INT32_MIN, INT32_MAX,
                                      3).astype(np.int64)

        indices = np.arange(data.shape[0])

        if self.tree_init:
            if self._angular_trees:
                self._rp_forest = [
                    flatten_tree(
                        make_angular_tree(data, indices, self.rng_state,
                                          self.leaf_size), self.leaf_size)
                    for i in range(n_trees)
                ]
            else:
                self._rp_forest = [
                    flatten_tree(
                        make_euclidean_tree(data, indices, self.rng_state,
                                            self.leaf_size), self.leaf_size)
                    for i in range(n_trees)
                ]

            leaf_array = np.vstack([tree.indices for tree in self._rp_forest])
        else:
            self._rp_forest = None
            leaf_array = np.array([[-1]])

        # Built once; used by the 'alternative' branch below and kept for
        # later searches.
        self._search = make_initialized_nnd_search(self._distance_func,
                                                   self._dist_args)

        if algorithm == 'standard' or leaf_array.shape[0] == 1:
            self._neighbor_graph = nn_descent(
                self._raw_data, self.n_neighbors, self.rng_state,
                self.max_candidates, self._distance_func, self._dist_args,
                self.n_iters, self.delta, self.rho, True, leaf_array)
        elif algorithm == 'alternative':
            graph_heap, search_heap = initialize_heaps(self._raw_data,
                                                       self.n_neighbors,
                                                       leaf_array,
                                                       self._distance_func,
                                                       self._dist_args)
            graph = lil_matrix((data.shape[0], data.shape[0]))
            graph.rows, graph.data = deheap_sort(graph_heap)
            graph = graph.maximum(graph.transpose())
            self._neighbor_graph = deheap_sort(
                self._search(self._raw_data, graph.indptr, graph.indices,
                             search_heap, self._raw_data))
        else:
            raise ValueError('Unknown algorithm selected')

        # Symmetrise the neighbor graph, prune it, and keep only its
        # sparsity pattern as the search graph.
        self._search_graph = lil_matrix((data.shape[0], data.shape[0]),
                                        dtype=np.float32)
        self._search_graph.rows = self._neighbor_graph[0]
        self._search_graph.data = self._neighbor_graph[1]
        self._search_graph = self._search_graph.maximum(
            self._search_graph.transpose()).tocsr()
        self._search_graph = prune(self._search_graph,
                                   prune_level=self.prune_level,
                                   n_neighbors=self.n_neighbors)
        self._search_graph = (self._search_graph != 0).astype(np.int8)

        self._random_init, self._tree_init = make_initialisations(
            self._distance_func, self._dist_args)
Exemple #11
0
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    """Build an approximate nearest-neighbour graph over sparse rows.

    Every vertex's heap is first filled with randomly sampled vertices,
    optionally augmented by ``sparse_init_rp_tree``, and then refined for
    at most ``n_iters`` rounds by measuring distances between vertices
    that appear in the same candidate list. Each ordered pair of vertices
    is measured at most once: pairs already evaluated are remembered in a
    ``tried`` set and skipped afterwards.

    Parameters
    ----------
    inds, indptr, data: arrays
        Sparse row storage. Row ``r`` has column indices
        ``inds[indptr[r]:indptr[r + 1]]`` and matching values
        ``data[indptr[r]:indptr[r + 1]]`` (CSR-style layout).

    n_vertices: int
        Number of rows to build neighbour lists for.

    n_neighbors: int
        Passed to ``make_heap`` as the per-vertex heap size and to
        ``rejection_sample`` for the random initialisation.

    rng_state:
        Random state forwarded to ``rejection_sample`` and
        ``new_build_candidates``.

    max_candidates: int (default = 50)
        Number of candidate slots scanned per vertex in each round.

    sparse_dist: callable (default = sparse_euclidean)
        Called as ``sparse_dist(inds_a, data_a, inds_b, data_b,
        *dist_args)`` to obtain the distance between two rows.

    dist_args: tuple (default = ())
        Extra positional arguments appended to every ``sparse_dist`` call.

    n_iters: int (default = 10)
        Maximum number of refinement rounds.

    delta: float (default = 0.001)
        Early-stopping control: refinement stops after a round whose
        update count is at most ``delta * n_neighbors * n_vertices``.

    rho: float (default = 0.5)
        Forwarded unchanged to ``new_build_candidates``.

    rp_tree_init: bool (default = True)
        When true, ``sparse_init_rp_tree`` is run on ``leaf_array`` after
        the random initialisation.

    leaf_array: (default = None)
        Forwarded to ``sparse_init_rp_tree``; only used when
        ``rp_tree_init`` is true.

    verbose: bool (default = False)
        When true, print the round counter at the start of each round.

    Returns
    -------
    The result of ``deheap_sort`` applied to the final heap.
    """
    # The set starts with a sentinel pair instead of being empty.
    # NOTE(review): presumably this lets a compiler infer the element type
    # of the set -- confirm against how this function is decorated.
    tried = set([(-1, -1)])

    current_graph = make_heap(n_vertices, n_neighbors)

    # Phase 1: random initialisation.
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)

        # Row i is the same for every sampled partner, so slice it once.
        from_inds = inds[indptr[i]:indptr[i + 1]]
        from_data = data[indptr[i]:indptr[i + 1]]

        for j in range(indices.shape[0]):
            other = indices[j]

            to_inds = inds[indptr[other]:indptr[other + 1]]
            to_data = data[indptr[other]:indptr[other + 1]]

            d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)

            # Record the distance in both directions.
            heap_push(current_graph, i, d, other, 1)
            heap_push(current_graph, other, d, i, 1)
            tried.add((i, other))
            tried.add((other, i))

    # Phase 2 (optional): initialisation from the supplied leaf array.
    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            dist_args,
            current_graph,
            leaf_array,
            tried=tried,
        )

    # Phase 3: iterative refinement.
    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             False,
         )

        # Sum of the values returned by unchecked_heap_push this round
        # (presumably the number of heap entries that changed).
        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    # Negative index marks an unused candidate slot.
                    continue

                # Row p is shared by both partner scans below.
                from_inds = inds[indptr[p]:indptr[p + 1]]
                from_data = data[indptr[p]:indptr[p + 1]]

                # Pair p with the new candidates from slot j onwards, so
                # each new/new slot pair is visited once (k == j pairs p
                # with itself).
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

                # Pair p with every old candidate of the same vertex.
                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

        # Stop early once a round produces too few updates.
        if c <= delta * n_neighbors * n_vertices:
            break

    return deheap_sort(current_graph)
Exemple #12
0
    def nn_descent(
        inds,
        indptr,
        data,
        n_vertices,
        n_neighbors,
        rng_state,
        max_candidates=50,
        n_iters=10,
        delta=0.001,
        rho=0.5,
        rp_tree_init=True,
        leaf_array=None,
        verbose=False,
    ):
        """Build an approximate nearest-neighbour graph over sparse rows.

        Every vertex's heap is first filled with randomly sampled
        vertices, optionally augmented with all pairs of vertices that
        share a row of ``leaf_array``, and then refined for at most
        ``n_iters`` rounds by measuring distances between vertices that
        appear in the same candidate list.

        ``sparse_dist`` and ``dist_args`` are not parameters: they are
        taken from the enclosing scope, which is outside this block.

        Parameters
        ----------
        inds, indptr, data: arrays
            Sparse row storage. Row ``r`` has column indices
            ``inds[indptr[r]:indptr[r + 1]]`` and matching values
            ``data[indptr[r]:indptr[r + 1]]`` (CSR-style layout).

        n_vertices: int
            Number of rows to build neighbour lists for.

        n_neighbors: int
            Passed to ``make_heap`` as the per-vertex heap size and to
            ``rejection_sample`` for the random initialisation.

        rng_state:
            Random state forwarded to ``rejection_sample``,
            ``build_candidates`` and ``tau_rand``.

        max_candidates: int (default = 50)
            Number of candidate slots scanned per vertex in each round.

        n_iters: int (default = 10)
            Maximum number of refinement rounds.

        delta: float (default = 0.001)
            Early-stopping control: refinement stops after a round whose
            update count is at most ``delta * n_neighbors * n_vertices``.

        rho: float (default = 0.5)
            A candidate is skipped for the current round whenever
            ``tau_rand(rng_state) < rho``.

        rp_tree_init: bool (default = True)
            When true, initialise additionally from ``leaf_array``.

        leaf_array: 2-d integer array (default = None)
            Each row lists vertex indices; the first negative entry in a
            row ends that row. Only read when ``rp_tree_init`` is true,
            in which case it must not be ``None``.

        verbose: bool (default = False)
            When true, print the round counter at the start of each round.

        Returns
        -------
        The result of ``deheap_sort`` applied to the final heap.
        """
        current_graph = make_heap(n_vertices, n_neighbors)

        # Phase 1: random initialisation.
        for i in range(n_vertices):
            indices = rejection_sample(n_neighbors, n_vertices, rng_state)

            # Row i is the same for every sampled partner; slice it once.
            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]

            for j in range(indices.shape[0]):
                other = indices[j]

                to_inds = inds[indptr[other]:indptr[other + 1]]
                to_data = data[indptr[other]:indptr[other + 1]]

                d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                *dist_args)

                # Record the distance in both directions.
                heap_push(current_graph, i, d, other, 1)
                heap_push(current_graph, other, d, i, 1)

        # Phase 2 (optional): connect every pair of vertices that share a
        # row of leaf_array.
        if rp_tree_init:
            n_leaves = leaf_array.shape[0]
            leaf_size = leaf_array.shape[1]
            for n in range(n_leaves):
                for i in range(leaf_size):
                    first = leaf_array[n, i]
                    if first < 0:
                        # A negative entry ends this row's members.
                        break

                    from_inds = inds[indptr[first]:indptr[first + 1]]
                    from_data = data[indptr[first]:indptr[first + 1]]

                    for j in range(i + 1, leaf_size):
                        second = leaf_array[n, j]
                        if second < 0:
                            break

                        to_inds = inds[indptr[second]:indptr[second + 1]]
                        to_data = data[indptr[second]:indptr[second + 1]]

                        d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                        *dist_args)

                        heap_push(current_graph, first, d, second, 1)
                        heap_push(current_graph, second, d, first, 1)

        # Phase 3: iterative refinement.
        for n in range(n_iters):
            if verbose:
                print("\t", n, " / ", n_iters)

            candidate_neighbors = build_candidates(current_graph, n_vertices,
                                                   n_neighbors, max_candidates,
                                                   rng_state)

            # Sum of the values returned by heap_push this round
            # (presumably the number of heap entries that changed).
            c = 0
            for i in range(n_vertices):
                for j in range(max_candidates):
                    p = int(candidate_neighbors[0, i, j])
                    # Skip unused slots. The random draw is only made for
                    # valid slots, so the order of this test matters.
                    if p < 0 or tau_rand(rng_state) < rho:
                        continue

                    # Values tied to slot j do not change while k varies.
                    # NOTE(review): layer 2 presumably holds the "new"
                    # flag of each candidate -- confirm in build_candidates.
                    p_flag = candidate_neighbors[2, i, j]
                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    for k in range(max_candidates):
                        q = int(candidate_neighbors[0, i, k])
                        # Skip unused slots, and pairs where neither
                        # member has its flag set.
                        if q < 0 or (not p_flag
                                     and not candidate_neighbors[2, i, k]):
                            continue

                        to_inds = inds[indptr[q]:indptr[q + 1]]
                        to_data = data[indptr[q]:indptr[q + 1]]

                        d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                        *dist_args)

                        c += heap_push(current_graph, p, d, q, 1)
                        c += heap_push(current_graph, q, d, p, 1)

            # Stop early once a round produces too few updates.
            if c <= delta * n_neighbors * n_vertices:
                break

        return deheap_sort(current_graph)