Example #1
0
def test_new_build_candidates():
    n_vertices = data.shape[0]

    current_graph = pynndescent_.init_current_graph(
        data, dist, dist_args, n_neighbors, rng_state=new_rng_state(), seed_per_row=True
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    current_graph = pynndescent_.init_current_graph(
        data, dist, dist_args, n_neighbors, rng_state=new_rng_state(), seed_per_row=True
    )
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    new_candidate_neighbors_threaded, old_candidate_neighbors_threaded = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        rho=0.5,
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(new_candidate_neighbors_threaded, new_candidate_neighbors)
    assert_allclose(old_candidate_neighbors_threaded, old_candidate_neighbors)
Example #2
0
def test_mark_candidate_results():

    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        new_rng_state(),
        seed_per_row=True,
    )
    pynndescent_.nn_descent_internal_low_memory_parallel(current_graph,
                                                         data,
                                                         n_neighbors,
                                                         new_rng_state(),
                                                         n_iters=2,
                                                         seed_per_row=True)
    current_graph_threaded = utils.Heap(
        current_graph[0].copy(),
        current_graph[1].copy(),
        current_graph[2].copy(),
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph_threaded,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(current_graph_threaded, current_graph)
Example #3
0
def test_new_build_candidates():
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(new_candidate_neighbors_threaded, new_candidate_neighbors)
    assert_allclose(old_candidate_neighbors_threaded, old_candidate_neighbors)
Example #4
0
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse.sparse_euclidean,
    n_iters=10,
    delta=0.001,
    rp_tree_init=False,
    leaf_array=None,
    verbose=False,
    n_jobs=None,
    seed_per_row=False,
):

    if rng_state is None:
        rng_state = new_rng_state()

    with joblib.Parallel(prefer="threads", n_jobs=n_jobs) as parallel:

        n_tasks = effective_n_jobs_with_context(n_jobs)
        chunk_size = int(math.ceil(n_vertices / n_tasks))

        current_graph = make_heap(n_vertices, n_neighbors)

        if rp_tree_init:
            sparse_init_rp_tree(
                inds,
                indptr,
                data,
                dist,
                current_graph,
                leaf_array,
                chunk_size,
                parallel,
            )

        sparse_init_random(
            current_graph,
            inds,
            indptr,
            data,
            dist,
            n_neighbors,
            chunk_size,
            rng_state,
            parallel,
            seed_per_row=seed_per_row,
        )

        # store the updates in an array
        # note that the factor here is `n_neighbors * n_neighbors`, not `max_candidates * max_candidates`
        # since no more than `n_neighbors` candidates are added for each row
        max_heap_update_count = chunk_size * n_neighbors * n_neighbors * 4
        heap_updates = np.zeros((n_tasks, max_heap_update_count, 4),
                                dtype=np.float32)
        heap_update_counts = np.zeros((n_tasks, ), dtype=np.int64)

        for n in range(n_iters):
            if verbose:
                print("\t", n, " / ", n_iters)

            (new_candidate_neighbors,
             old_candidate_neighbors) = new_build_candidates(
                 current_graph,
                 n_vertices,
                 n_neighbors,
                 max_candidates,
                 chunk_size,
                 rng_state,
                 parallel,
                 seed_per_row=seed_per_row,
             )

            def nn_descent_map(index):
                rows = chunk_rows(chunk_size, index, n_vertices)
                return (
                    index,
                    sparse_nn_descent_map_jit(
                        rows,
                        max_candidates,
                        inds,
                        indptr,
                        data,
                        new_candidate_neighbors,
                        old_candidate_neighbors,
                        heap_updates[index],
                        offset=0,
                        sparse_dist=dist,
                    ),
                )

            def nn_decent_reduce(index):
                return nn_decent_reduce_jit(n_tasks, current_graph,
                                            heap_updates, offsets, index)

            # run map functions
            for index, count in parallel(
                    parallel_calls(nn_descent_map, n_tasks)):
                heap_update_counts[index] = count

            # sort and chunk heap updates so they can be applied in the reduce
            max_count = heap_update_counts.max()
            offsets = np.zeros((n_tasks, max_count), dtype=np.int64)

            def shuffle(index):
                return shuffle_jit(
                    heap_updates,
                    heap_update_counts,
                    offsets,
                    chunk_size,
                    n_vertices,
                    index,
                )

            parallel(parallel_calls(shuffle, n_tasks))

            # then run reduce functions
            c = 0
            for c_part in parallel(parallel_calls(nn_decent_reduce, n_tasks)):
                c += c_part

            if c <= delta * n_neighbors * data.shape[0]:
                break

        def deheap_sort_map(index):
            rows = chunk_rows(chunk_size, index, n_vertices)
            return index, deheap_sort_map_jit(rows, current_graph)

        parallel(parallel_calls(deheap_sort_map, n_tasks))
        return current_graph[0].astype(np.int64), current_graph[1]