Example #1
def test_new_build_candidates():
    n_vertices = data.shape[0]

    current_graph = pynndescent_.init_current_graph(
        data, dist, dist_args, n_neighbors, rng_state=new_rng_state(), seed_per_row=True
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    current_graph = pynndescent_.init_current_graph(
        data, dist, dist_args, n_neighbors, rng_state=new_rng_state(), seed_per_row=True
    )
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        rho=0.5,
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(new_candidate_neighbors_threaded, new_candidate_neighbors)
    assert_allclose(old_candidate_neighbors_threaded, old_candidate_neighbors)
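This test leans on module-level fixtures defined elsewhere in pynndescent's test suite (data, dist, dist_args, n_neighbors, max_candidates, chunk_size, new_rng_state). A minimal sketch of what those fixtures might look like; the import paths and the three-word rng_state convention are assumptions about the pynndescent version these examples were taken from, not the suite's actual fixture code:

import numpy as np
import joblib
from numpy.testing import assert_allclose

from pynndescent import distances, pynndescent_, threaded, utils

# Hypothetical module-level fixtures assumed by the test above.
np.random.seed(42)
data = np.random.rand(100, 128).astype(np.float32)  # small random dataset
dist = distances.euclidean                           # metric under test
dist_args = ()                                       # no extra metric parameters
n_neighbors = 25
max_candidates = 10
chunk_size = data.shape[0] // 8

def new_rng_state():
    # pynndescent-style RNG state: three int64 words, freshly drawn each call
    return np.random.randint(
        np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3
    ).astype(np.int64)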
Example #2
def nn_descent_internal_low_memory_parallel(
    current_graph,
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    verbose=False,
    seed_per_row=False,
):
    n_vertices = indptr.shape[0] - 1
    block_size = 16384
    n_blocks = n_vertices // block_size

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             seed_per_row,
         )

        c = 0
        for i in range(n_blocks + 1):
            block_start = i * block_size
            block_end = min(n_vertices, (i + 1) * block_size)

            new_candidate_block = new_candidate_neighbors[
                0, block_start:block_end]
            old_candidate_block = old_candidate_neighbors[
                0, block_start:block_end]
            dist_thresholds = current_graph[1, :, 0]

            updates = generate_graph_updates(
                new_candidate_block,
                old_candidate_block,
                dist_thresholds,
                inds,
                indptr,
                data,
                dist,
                dist_args,
            )

            c += apply_graph_updates_low_memory(current_graph, updates)

        if c <= delta * n_neighbors * n_vertices:
            return
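The block arithmetic above slices the vertex range into chunks of at most 16384 vertices (plus a possibly empty trailing block), and the iteration loop exits early once an iteration produces no more than delta * n_neighbors * n_vertices heap updates. A standalone sketch of just that bookkeeping, with illustrative values rather than anything taken from the library:

n_vertices = 40000
block_size = 16384

n_blocks = n_vertices // block_size            # 2 full blocks ...
for i in range(n_blocks + 1):                  # ... plus one partial trailing block
    block_start = i * block_size
    block_end = min(n_vertices, (i + 1) * block_size)
    print(i, block_start, block_end)           # (0, 0, 16384) (1, 16384, 32768) (2, 32768, 40000)

# Stopping rule: with delta=0.001 and n_neighbors=30, an iteration that makes
# no more than 0.001 * 30 * 40000 = 1200 heap updates ends the descent.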
Example #3
def nn_descent_internal_low_memory(
    current_graph,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    verbose=False,
    seed_per_row=False,
):
    n_vertices = data.shape[0]

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             seed_per_row,
         )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        c += heap_push(current_graph, q, d, p, 1)

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        c += heap_push(current_graph, q, d, p, 1)

        if c <= delta * n_neighbors * data.shape[0]:
            return
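The inner loops above are NN-descent's "local join": for each vertex, every pair of its new candidates, and every new/old candidate pair, is compared, and both endpoints try to adopt the other as a neighbour. A toy, self-contained illustration of the same pattern using a plain sorted list in place of pynndescent's heap structure; all names here are illustrative and this is not the library's implementation:

import numpy as np

def local_join_toy(points, new_cand, old_cand, neighbors, n_neighbors):
    # neighbors[v] is a sorted list of (distance, index) pairs, length <= n_neighbors
    def push(v, d, u):
        # mimic heap_push: return 1 only if the neighbour list actually changes
        if any(idx == u for _, idx in neighbors[v]):
            return 0
        if len(neighbors[v]) >= n_neighbors and d >= neighbors[v][-1][0]:
            return 0
        neighbors[v].append((d, u))
        neighbors[v].sort()
        del neighbors[v][n_neighbors:]
        return 1

    c = 0
    for i in range(len(points)):
        for j, p in enumerate(new_cand[i]):
            if p < 0:
                continue
            for q in new_cand[i][j:]:        # join new candidates pairwise
                if q < 0 or q == p:
                    continue
                d = float(np.linalg.norm(points[p] - points[q]))
                c += push(p, d, q) + push(q, d, p)
            for q in old_cand[i]:            # join new candidates against old ones
                if q < 0 or q == p:
                    continue
                d = float(np.linalg.norm(points[p] - points[q]))
                c += push(p, d, q) + push(q, d, p)
    return c

# usage on a tiny dataset
rng = np.random.default_rng(0)
pts = rng.random((6, 2))
nbrs = [[] for _ in range(len(pts))]
new_c = [[0, 1, 2], [3, 4, -1], [5, 0, -1], [1, 2, 3], [4, 5, -1], [0, 3, -1]]
old_c = [[-1, -1, -1] for _ in range(len(pts))]
print(local_join_toy(pts, new_c, old_c, nbrs, n_neighbors=3))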
Example #4
def nn_descent_internal_high_memory_parallel(
    current_graph,
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    verbose=False,
):
    n_vertices = indptr.shape[0] - 1
    block_size = 16384
    n_blocks = n_vertices // block_size
    n_threads = numba.get_num_threads()

    in_graph = [
        set(current_graph[0][i].astype(np.int64))
        for i in range(current_graph[0].shape[0])
    ]

    for n in range(n_iters):
        if verbose:
            print("\t", n + 1, " / ", n_iters)

        (new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
            current_graph, max_candidates, rng_state, n_threads
        )

        c = 0
        for i in range(n_blocks + 1):
            block_start = i * block_size
            block_end = min(n_vertices, (i + 1) * block_size)

            new_candidate_block = new_candidate_neighbors[block_start:block_end]
            old_candidate_block = old_candidate_neighbors[block_start:block_end]
            dist_thresholds = current_graph[1][:, 0]

            updates = generate_graph_updates(
                new_candidate_block,
                old_candidate_block,
                dist_thresholds,
                inds,
                indptr,
                data,
                dist,
            )

            c += apply_graph_updates_high_memory(current_graph, updates, in_graph)

        if c <= delta * n_neighbors * n_vertices:
            if verbose:
                print("\tStopping threshold met -- exiting after", n + 1, "iterations")
            return
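The high-memory variant trades memory for speed: alongside the heap it keeps a per-vertex set of current neighbour indices (in_graph), so apply_graph_updates_high_memory can discard an update without touching the heap when the edge is already present. A tiny illustration of that membership-set bookkeeping with made-up values; it only mirrors the in_graph construction above and is not the library's update routine:

import numpy as np

heap_indices = np.array([[3.0, 7.0, -1.0],
                         [0.0, 2.0, 5.0]])          # -1 marks an empty heap slot
in_graph = [set(row.astype(np.int64)) for row in heap_indices]

def already_neighbour(vertex, candidate):
    # an update can be skipped outright when the edge already exists
    return candidate in in_graph[vertex]

print(already_neighbour(0, 7), already_neighbour(0, 5))   # True False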
Example #5
def test_mark_candidate_results():

    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        new_rng_state(),
        seed_per_row=True,
    )
    pynndescent_.nn_descent_internal_low_memory_parallel(
        current_graph,
        data,
        n_neighbors,
        new_rng_state(),
        n_iters=2,
        seed_per_row=True,
    )
    current_graph_threaded = utils.Heap(
        current_graph[0].copy(),
        current_graph[1].copy(),
        current_graph[2].copy(),
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph_threaded,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(current_graph_threaded, current_graph)
Example #6
def test_new_build_candidates():
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors,
        data,
        current_graph,
        dist,
        dist_args,
        new_rng_state(),
        seed_per_row=True,
    )
    parallel = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=parallel,
        seed_per_row=True,
    )

    assert_allclose(new_candidate_neighbors_threaded, new_candidate_neighbors)
    assert_allclose(old_candidate_neighbors_threaded, old_candidate_neighbors)
Example #7
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
    seed_per_row=False,
):
    n_vertices = data.shape[0]
    tried = set([(-1, -1)])

    current_graph = make_heap(data.shape[0], n_neighbors)
    for i in range(data.shape[0]):
        if seed_per_row:
            seed(rng_state, i)
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        init_rp_tree(data,
                     dist,
                     dist_args,
                     current_graph,
                     leaf_array,
                     tried=tried)

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             seed_per_row,
         )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

        if c <= delta * n_neighbors * data.shape[0]:
            break

    return deheap_sort(current_graph)
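A hedged sketch of how nn_descent, as reproduced above, might be driven on a small dense dataset. It assumes the function and its helpers (make_heap, heap_push, deheap_sort, and so on) are importable from the pynndescent version these examples come from, and it uses pynndescent's three-word int64 rng_state convention; rp_tree_init is turned off so no leaf_array is needed:

import numpy as np
from pynndescent import distances

data = np.random.rand(200, 16).astype(np.float32)
rng_state = np.random.randint(
    np.iinfo(np.int32).min + 1, np.iinfo(np.int32).max - 1, 3
).astype(np.int64)

indices, dists = nn_descent(
    data,
    n_neighbors=10,
    rng_state=rng_state,
    max_candidates=20,
    dist=distances.euclidean,
    n_iters=5,
    rp_tree_init=False,   # skip random-projection-tree seeding for simplicity
    verbose=True,
)
print(indices.shape, dists.shape)   # (200, 10) each, sorted by distance per row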
Example #8
def sparse_nn_descent_internal_low_memory(
        current_graph,
        inds,
        indptr,
        data,
        n_vertices,
        n_neighbors,
        rng_state,
        max_candidates=50,
        sparse_dist=sparse_euclidean,
        dist_args=(),
        n_iters=10,
        delta=0.001,
        rho=0.5,
        verbose=False,
):
    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             False,
         )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        c += heap_push(current_graph, q, d, p, 1)

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        c += heap_push(current_graph, q, d, p, 1)

        if c <= delta * n_neighbors * n_vertices:
            return
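All of the sparse variants expect the CSR components of the data matrix: inds holds nonzero column indices, data the corresponding values, and indptr[p]:indptr[p + 1] delimits row p, which is exactly what the from_inds/from_data slices above recover. A small standalone check of that slicing using scipy (the example matrix is made up):

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[0.0, 1.0, 0.0],
                         [2.0, 0.0, 3.0]]))
inds, indptr, data = X.indices, X.indptr, X.data

p = 1
row_inds = inds[indptr[p]:indptr[p + 1]]   # nonzero column indices of row p
row_data = data[indptr[p]:indptr[p + 1]]   # corresponding values
print(row_inds, row_data)                  # [0 2] [2. 3.]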
Example #9
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):

    tried = set([(-1, -1)])

    current_graph = make_heap(n_vertices, n_neighbors)
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)
        for j in range(indices.shape[0]):

            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]

            to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
            to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]

            d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)

            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))

    if rp_tree_init:
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            dist_args,
            current_graph,
            leaf_array,
            tried=tried,
        )

    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)

        (new_candidate_neighbors,
         old_candidate_neighbors) = new_build_candidates(
             current_graph,
             n_vertices,
             n_neighbors,
             max_candidates,
             rng_state,
             rho,
             False,
         )

        c = 0
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    continue
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue

                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]

                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]

                    d = sparse_dist(from_inds, from_data, to_inds, to_data,
                                    *dist_args)

                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))

        if c <= delta * n_neighbors * n_vertices:
            break

    return deheap_sort(current_graph)
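Both the dense and sparse full nn_descent routines above also keep a tried set of ordered vertex pairs so a distance is never recomputed for the same directed edge; it is seeded with the sentinel pair (-1, -1). A minimal, illustrative version of that bookkeeping (not the library's code):

import numpy as np

tried = {(-1, -1)}   # sentinel pair, mirroring the initialisation above

def distance_once(p, q, compute):
    # compute the distance for the pair (p, q) at most once
    if (p, q) in tried:
        return None
    tried.add((p, q))
    if p != q:
        tried.add((q, p))
    return compute(p, q)

pts = np.random.rand(5, 3)
print(distance_once(0, 1, lambda a, b: float(np.linalg.norm(pts[a] - pts[b]))))
print(distance_once(0, 1, lambda a, b: 0.0))   # None: this pair was already tried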