def test_new_build_candidates():
    """Threaded new_build_candidates must match the serial implementation.

    Builds two identically-seeded current graphs (module-level fixtures
    supply ``data``, ``dist``, ``dist_args``, ``n_neighbors``,
    ``max_candidates`` and ``chunk_size``), runs the serial and threaded
    candidate builders, and checks the outputs agree.
    """
    n_vertices = data.shape[0]

    # Serial reference run.
    current_graph = pynndescent_.init_current_graph(
        data, dist, dist_args, n_neighbors, rng_state=new_rng_state(), seed_per_row=True
    )
    new_serial, old_serial = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    # Threaded run from an identically-seeded starting graph.
    current_graph = pynndescent_.init_current_graph(
        data, dist, dist_args, n_neighbors, rng_state=new_rng_state(), seed_per_row=True
    )
    pool = joblib.Parallel(n_jobs=2, prefer="threads")
    new_threaded, old_threaded = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        rho=0.5,
        parallel=pool,
        seed_per_row=True,
    )

    assert_allclose(new_threaded, new_serial)
    assert_allclose(old_threaded, old_serial)
def nn_descent_internal_low_memory_parallel(
    current_graph,
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    verbose=False,
    seed_per_row=False,
):
    """Low-memory NN-descent iterations over CSR-format sparse data.

    Repeatedly builds new/old candidate lists, generates distance updates in
    fixed-size blocks of vertices, and applies them to ``current_graph``
    in place.  Stops early once an iteration produces at most
    ``delta * n_neighbors * n_vertices`` heap updates.
    """
    # CSR convention: indptr has one extra entry, so vertex count is len-1.
    n_vertices = indptr.shape[0] - 1
    block_size = 16384
    n_blocks = n_vertices // block_size

    for iteration in range(n_iters):
        if verbose:
            print("\t", iteration, " / ", n_iters)

        new_candidates, old_candidates = new_build_candidates(
            current_graph,
            n_vertices,
            n_neighbors,
            max_candidates,
            rng_state,
            seed_per_row,
        )

        update_count = 0
        for block_idx in range(n_blocks + 1):
            start = block_idx * block_size
            # Final block is clipped to the vertex count.
            end = min(n_vertices, (block_idx + 1) * block_size)

            updates = generate_graph_updates(
                new_candidates[0, start:end],
                old_candidates[0, start:end],
                # Per-row pruning threshold: slot 0 of the distance layer
                # (presumably the current worst neighbor distance — heap root).
                current_graph[1, :, 0],
                inds,
                indptr,
                data,
                dist,
                dist_args,
            )
            update_count += apply_graph_updates_low_memory(current_graph, updates)

        # Convergence: too few successful pushes this iteration.
        if update_count <= delta * n_neighbors * n_vertices:
            return
def nn_descent_internal_low_memory(
    current_graph,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    verbose=False,
    seed_per_row=False,
):
    """Serial low-memory NN-descent iterations over dense ``data``.

    Each iteration builds new/old candidate lists and pushes candidate-pair
    distances into ``current_graph`` (mutated in place).  Returns early once
    fewer than ``delta * n_neighbors * n_vertices`` pushes succeed.
    """
    n_vertices = data.shape[0]
    for iteration in range(n_iters):
        if verbose:
            print("\t", iteration, " / ", n_iters)

        new_candidates, old_candidates = new_build_candidates(
            current_graph,
            n_vertices,
            n_neighbors,
            max_candidates,
            rng_state,
            rho,
            seed_per_row,
        )

        push_count = 0
        for row in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidates[0, row, j])
                if p < 0:
                    # Empty candidate slot.
                    continue
                # new-new pairs; k starts at j so each unordered pair is
                # considered once.
                for k in range(j, max_candidates):
                    q = int(new_candidates[0, row, k])
                    if q < 0:
                        continue
                    d = dist(data[p], data[q], *dist_args)
                    push_count += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        push_count += heap_push(current_graph, q, d, p, 1)
                # new-old pairs.
                for k in range(max_candidates):
                    q = int(old_candidates[0, row, k])
                    if q < 0:
                        continue
                    d = dist(data[p], data[q], *dist_args)
                    push_count += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        push_count += heap_push(current_graph, q, d, p, 1)

        if push_count <= delta * n_neighbors * data.shape[0]:
            return
def nn_descent_internal_high_memory_parallel(
    current_graph,
    inds,
    indptr,
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=sparse_euclidean,
    n_iters=10,
    delta=0.001,
    verbose=False,
):
    """High-memory NN-descent iterations over CSR-format sparse data.

    Keeps an ``in_graph`` membership set per vertex so duplicate insertions
    can be rejected cheaply, at the cost of extra memory.  Mutates
    ``current_graph`` in place; stops when an iteration yields at most
    ``delta * n_neighbors * n_vertices`` updates.
    """
    n_vertices = indptr.shape[0] - 1
    block_size = 16384
    n_blocks = n_vertices // block_size
    n_threads = numba.get_num_threads()

    # One python set of current neighbor ids per vertex, built from the
    # index layer of the heap tuple.
    in_graph = [
        set(current_graph[0][row].astype(np.int64))
        for row in range(current_graph[0].shape[0])
    ]

    for iteration in range(n_iters):
        if verbose:
            print("\t", iteration + 1, " / ", n_iters)

        new_candidates, old_candidates = new_build_candidates(
            current_graph, max_candidates, rng_state, n_threads
        )

        update_count = 0
        for block_idx in range(n_blocks + 1):
            start = block_idx * block_size
            end = min(n_vertices, (block_idx + 1) * block_size)

            updates = generate_graph_updates(
                new_candidates[start:end],
                old_candidates[start:end],
                # Per-row pruning threshold from the distance layer, slot 0.
                current_graph[1][:, 0],
                inds,
                indptr,
                data,
                dist,
            )
            update_count += apply_graph_updates_high_memory(
                current_graph, updates, in_graph
            )

        if update_count <= delta * n_neighbors * n_vertices:
            if verbose:
                print("\tStopping threshold met -- exiting after", iteration + 1, "iterations")
            return
def test_mark_candidate_results():
    """Serial and threaded candidate builds must leave identical graph state.

    Both builders mutate the heap they are given (candidate marking is a
    side effect), so the returned candidate arrays are intentionally unused:
    the assertion compares the two mutated graphs.
    """
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    # Seed a graph and run a couple of descent iterations on it.
    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors, data, current_graph, dist, new_rng_state(), seed_per_row=True,
    )
    pynndescent_.nn_descent_internal_low_memory_parallel(
        current_graph, data, n_neighbors, new_rng_state(), n_iters=2, seed_per_row=True
    )

    # Deep copy for the threaded path so mutations stay independent.
    current_graph_threaded = utils.Heap(
        current_graph[0].copy(),
        current_graph[1].copy(),
        current_graph[2].copy(),
    )

    # Serial build — mutates current_graph.
    new_candidate_neighbors, old_candidate_neighbors = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    # Threaded build — mutates current_graph_threaded.
    pool = joblib.Parallel(n_jobs=2, prefer="threads")
    (
        new_candidate_neighbors_threaded,
        old_candidate_neighbors_threaded,
    ) = threaded.new_build_candidates(
        current_graph_threaded,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=pool,
        seed_per_row=True,
    )

    assert_allclose(current_graph_threaded, current_graph)
def test_new_build_candidates():
    """Threaded new_build_candidates must match the serial implementation.

    Both runs start from freshly initialized, identically-seeded random
    graphs; the new/old candidate arrays must agree.
    """
    np.random.seed(42)
    N = 100
    D = 128
    chunk_size = N // 8
    n_neighbors = 25
    data = np.random.rand(N, D).astype(np.float32)
    n_vertices = data.shape[0]

    # Serial reference run.
    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors, data, current_graph, dist, dist_args, new_rng_state(),
        seed_per_row=True,
    )
    new_serial, old_serial = utils.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        rng_state=new_rng_state(),
        seed_per_row=True,
    )

    # Threaded run from an identically-seeded graph.
    current_graph = utils.make_heap(data.shape[0], n_neighbors)
    pynndescent_.init_random(
        n_neighbors, data, current_graph, dist, dist_args, new_rng_state(),
        seed_per_row=True,
    )
    pool = joblib.Parallel(n_jobs=2, prefer="threads")
    new_threaded, old_threaded = threaded.new_build_candidates(
        current_graph,
        n_vertices,
        n_neighbors,
        max_candidates,
        chunk_size=chunk_size,
        rng_state=new_rng_state(),
        parallel=pool,
        seed_per_row=True,
    )

    assert_allclose(new_threaded, new_serial)
    assert_allclose(old_threaded, old_serial)
def nn_descent(
    data,
    n_neighbors,
    rng_state,
    max_candidates=50,
    dist=dist.euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
    seed_per_row=False,
):
    """Build an approximate k-nearest-neighbor graph of ``data`` by NN-descent.

    Initializes a heap with random neighbors (optionally refined by an RP-tree
    pass), then iteratively improves it via candidate-pair distance pushes.
    A ``tried`` set records every (p, q) pair already evaluated so each
    distance is computed at most once.  Returns ``deheap_sort(current_graph)``
    (sorted neighbor indices and distances).
    """
    n_vertices = data.shape[0]
    # Sentinel pair so the set is non-empty (useful for numba typing —
    # NOTE(review): presumed reason, confirm against the jit context).
    tried = set([(-1, -1)])
    current_graph = make_heap(data.shape[0], n_neighbors)
    # --- Random initialization: n_neighbors distinct random neighbors per row.
    for i in range(data.shape[0]):
        if seed_per_row:
            # Deterministic per-row RNG stream when requested.
            seed(rng_state, i)
        indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
        for j in range(indices.shape[0]):
            d = dist(data[i], data[indices[j]], *dist_args)
            # Push both directions; final flag 1 marks the entry as "new".
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))
    if rp_tree_init:
        # Refine the random start with RP-tree leaf neighborhoods.
        init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=tried)
    # --- NN-descent iterations.
    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)
        (new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
            current_graph,
            n_vertices,
            n_neighbors,
            max_candidates,
            rng_state,
            rho,
            seed_per_row,
        )
        c = 0  # number of successful heap pushes this iteration
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    # Empty candidate slot.
                    continue
                # new-new pairs; k starts at j so each unordered pair is
                # considered once.
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue
                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))
                # new-old pairs.
                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue
                    d = dist(data[p], data[q], *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))
        # Early exit once updates fall below the convergence threshold.
        if c <= delta * n_neighbors * data.shape[0]:
            break
    return deheap_sort(current_graph)
def sparse_nn_descent_internal_low_memory(
    current_graph,
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    verbose=False,
):
    """Serial low-memory NN-descent iterations over CSR-format sparse data.

    Each iteration builds new/old candidate lists and pushes candidate-pair
    sparse distances into ``current_graph`` (mutated in place).  Returns
    early once at most ``delta * n_neighbors * n_vertices`` pushes succeed.

    Fix vs. previous version: row ``p``'s CSR slices (``from_inds`` /
    ``from_data``) depend only on ``p`` and are now hoisted out of the two
    inner ``k`` loops instead of being re-sliced for every candidate ``q``.
    """
    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)
        (new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
            current_graph,
            n_vertices,
            n_neighbors,
            max_candidates,
            rng_state,
            rho,
            False,
        )
        c = 0  # number of successful heap pushes this iteration
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    # Empty candidate slot.
                    continue
                # Loop-invariant hoist: p's sparse row does not change with k.
                from_inds = inds[indptr[p]:indptr[p + 1]]
                from_data = data[indptr[p]:indptr[p + 1]]
                # new-new pairs; k starts at j so each unordered pair is
                # considered once.
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue
                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]
                    d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        c += heap_push(current_graph, q, d, p, 1)
                # new-old pairs.
                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0:
                        continue
                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]
                    d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
                    c += heap_push(current_graph, p, d, q, 1)
                    if p != q:
                        c += heap_push(current_graph, q, d, p, 1)
        # Convergence threshold reached — stop iterating.
        if c <= delta * n_neighbors * n_vertices:
            return
def sparse_nn_descent(
    inds,
    indptr,
    data,
    n_vertices,
    n_neighbors,
    rng_state,
    max_candidates=50,
    sparse_dist=sparse_euclidean,
    dist_args=(),
    n_iters=10,
    delta=0.001,
    rho=0.5,
    rp_tree_init=True,
    leaf_array=None,
    verbose=False,
):
    """Build an approximate k-NN graph of CSR-format sparse data by NN-descent.

    Initializes a heap with random neighbors (optionally refined by a sparse
    RP-tree pass), then iteratively improves it via candidate-pair distance
    pushes.  A ``tried`` set records every (p, q) pair already evaluated so
    each sparse distance is computed at most once.  Returns
    ``deheap_sort(current_graph)``.
    """
    # Sentinel pair so the set is non-empty (useful for numba typing —
    # NOTE(review): presumed reason, confirm against the jit context).
    tried = set([(-1, -1)])
    current_graph = make_heap(n_vertices, n_neighbors)
    # --- Random initialization: n_neighbors distinct random neighbors per row.
    for i in range(n_vertices):
        indices = rejection_sample(n_neighbors, n_vertices, rng_state)
        for j in range(indices.shape[0]):
            # CSR row slices for vertices i and indices[j].
            from_inds = inds[indptr[i]:indptr[i + 1]]
            from_data = data[indptr[i]:indptr[i + 1]]
            to_inds = inds[indptr[indices[j]]:indptr[indices[j] + 1]]
            to_data = data[indptr[indices[j]]:indptr[indices[j] + 1]]
            d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
            # Push both directions; final flag 1 marks the entry as "new".
            heap_push(current_graph, i, d, indices[j], 1)
            heap_push(current_graph, indices[j], d, i, 1)
            tried.add((i, indices[j]))
            tried.add((indices[j], i))
    if rp_tree_init:
        # Refine the random start with sparse RP-tree leaf neighborhoods.
        sparse_init_rp_tree(
            inds,
            indptr,
            data,
            sparse_dist,
            dist_args,
            current_graph,
            leaf_array,
            tried=tried,
        )
    # --- NN-descent iterations.
    for n in range(n_iters):
        if verbose:
            print("\t", n, " / ", n_iters)
        (new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
            current_graph,
            n_vertices,
            n_neighbors,
            max_candidates,
            rng_state,
            rho,
            False,
        )
        c = 0  # number of successful heap pushes this iteration
        for i in range(n_vertices):
            for j in range(max_candidates):
                p = int(new_candidate_neighbors[0, i, j])
                if p < 0:
                    # Empty candidate slot.
                    continue
                # new-new pairs; k starts at j so each unordered pair is
                # considered once.
                for k in range(j, max_candidates):
                    q = int(new_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue
                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]
                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]
                    d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))
                # new-old pairs.
                for k in range(max_candidates):
                    q = int(old_candidate_neighbors[0, i, k])
                    if q < 0 or (p, q) in tried:
                        continue
                    from_inds = inds[indptr[p]:indptr[p + 1]]
                    from_data = data[indptr[p]:indptr[p + 1]]
                    to_inds = inds[indptr[q]:indptr[q + 1]]
                    to_data = data[indptr[q]:indptr[q + 1]]
                    d = sparse_dist(from_inds, from_data, to_inds, to_data, *dist_args)
                    c += unchecked_heap_push(current_graph, p, d, q, 1)
                    tried.add((p, q))
                    if p != q:
                        c += unchecked_heap_push(current_graph, q, d, p, 1)
                        tried.add((q, p))
        # Early exit once updates fall below the convergence threshold.
        if c <= delta * n_neighbors * n_vertices:
            break
    return deheap_sort(current_graph)