def find_edges(input, test, K):
    """Return kNN edges from rows of ``test`` to rows of ``input``.

    A nearest-neighbour index is built over ``input`` with the backend
    selected by the module-level ``kNN_type`` (1: scikit-learn ball tree,
    2: SciPy KD-tree, 3: n2 HNSW, 4: PySparNN cluster index).  Every row of
    ``test`` is then queried for ``K + 1`` neighbours.  Progress and elapsed
    time are printed along the way.

    Args:
        input: Data to index.  ``.todense()`` is called on it for backends
            1-3, so presumably a SciPy sparse matrix -- verify against caller.
        test: Query data, treated the same way as ``input``.
        K: Neighbour count; ``K + 1`` neighbours are requested per query.

    Returns:
        A list of ``(query_position, neighbour_index)`` tuples.  Pairs whose
        two members are equal are left out.

    Raises:
        NotImplementedError: ``kNN_type`` is not one of 1, 2, 3 or 4.
    """
    print("building kNN classifier ... ", end=" ")
    started = time.time()

    data, queries = input, test
    if kNN_type <= 3:
        # These backends are fed dense data.
        data, queries = input.todense(), test.todense()

    # --- build the index -------------------------------------------------
    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        searcher = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree')
        knn = searcher.fit(data)
    elif kNN_type == 2:
        from scipy import spatial
        knn = spatial.KDTree(data)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        knn = HnswIndex(data.shape[1], distance_type)
        for row in tqdm(range(data.shape[0])):
            knn.add_data(data[row, :])
        knn.build(n_threads=10)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        row_count = data.shape[0]
        knn = ci.MultiClusterIndex(data, range(row_count))
    else:
        raise NotImplementedError
    print(f"time={time.time() - started:.3f}s")

    # --- query the index -------------------------------------------------
    print("finding indices ... ", end=" ")
    if kNN_type == 1:
        _, neighbours = knn.kneighbors(queries)
    elif kNN_type == 2:
        _, neighbours = knn.query(queries, k=K + 1)
    elif kNN_type == 3:
        neighbours = [
            knn.search_by_vector(queries[row, :], k=K + 1)
            for row in tqdm(range(queries.shape[0]))
        ]
    else:
        # Only backend 4 can reach this point; anything else raised above.
        neighbours = knn.search(
            queries, k=K + 1, k_clusters=100, return_distance=False
        )
    print(f"time={time.time() - started:.3f}s")

    # --- turn neighbour lists into edges, skipping equal-index pairs ------
    edges = [
        (src, dst)
        for src, hits in enumerate(neighbours)
        for dst in map(int, hits)
        if src != dst
    ]
    print(f"done! .... time={time.time() - started:.3f}s")
    return edges
def find_edges(input, test, K):
    """Build a kNN index over ``input`` and return neighbour edges for ``test``.

    The backend is chosen by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW,
    4 = PySparNN cluster index, 5 = nmslib HNSW on sparse vectors.
    Each row of ``test`` is queried for ``K + 1`` neighbours.  Progress and
    elapsed time are printed to stdout.

    Args:
        input: Data to index.  ``.todense()`` is called on it for backends
            1 and 2, so presumably a SciPy sparse matrix -- verify against
            the caller.
        test: Query data, handled the same way as ``input``.
        K: Neighbour count; ``K + 1`` neighbours are requested per query.

    Returns:
        A list of ``(query_position, neighbour_index)`` tuples.  Pairs whose
        two members are equal are dropped.

    Raises:
        NotImplementedError: ``kNN_type`` is not in 1..5.
        AssertionError: a query returned a neighbour list whose length is
            not ``K + 1``.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in (1, 2):
        # sklearn / SciPy trees are given dense data here.
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        tree = HnswIndex(input.shape[1], distance_type)
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        # num_threads is read from module scope in this revision.
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_names = ['l2_sparse', 'cosinesimil_sparse']
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... \n", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        # knnQueryBatch results are indexed with [0] to keep the neighbour ids.
        indices_ = tree.knnQueryBatch(test, k=K+1, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        # Explicit raise (not ``assert``) so the check survives ``python -O``;
        # the exception type is kept for any caller that relies on it.
        if len(per) != K + 1:
            raise AssertionError(
                f"index1={index1} len(per)={len(per)} != K+1={K + 1}"
            )
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
def find_edges(input, test, K, cluster_ids, query_ids):
    """Build a kNN index over ``input`` and return id-mapped neighbour edges.

    The backend is chosen by the module-level ``kNN_type``:
    1 = scikit-learn ball tree, 2 = SciPy KD-tree, 3 = n2 HNSW,
    4 = PySparNN cluster index, 5 = nmslib HNSW on sparse vectors
    (cosine similarity).  Progress and elapsed time are printed to stdout.

    Args:
        input: Data to index.  ``.todense()`` is called on it for backends
            1 and 2, so presumably a SciPy sparse matrix -- verify against
            the caller.
        test: Query data, handled the same way as ``input``.
        K: Neighbour count.  Backends 1-4 request ``K + 1`` neighbours per
            query; backend 5 requests ``K``.
        cluster_ids: Indexable mapping from a row position in ``input`` to
            the id emitted as the second member of each edge.
        query_ids: Indexable mapping from a row position in ``test`` to the
            id emitted as the first member of each edge.

    Returns:
        A list of ``(query_ids[i], cluster_ids[j])`` tuples, one for every
        neighbour position ``j`` returned for query position ``i`` with
        ``i != j``.

    Raises:
        NotImplementedError: ``kNN_type`` is not in 1..5.
    """
    print("\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in (1, 2):
        # sklearn / SciPy trees are given dense data here.
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        # distance_type in ['angular', 'L2']
        tree = HnswIndex(input.shape[1], distance_type)
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        # num_threads is reused by the query branch for this backend below.
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads,
                             'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... \n", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        # NOTE(review): this backend asks for K neighbours while the others
        # ask for K + 1 -- confirm that the difference is intended.
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            # NOTE(review): positions are compared before id mapping; this
            # only removes self-matches if ``test`` and ``input`` share row
            # order -- confirm against the caller.
            if index1 != index2:
                # Map through the ``cluster_ids`` parameter; the previous
                # name ``center_ids`` is not defined in this function.
                edge_list.append((query_ids[index1], cluster_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list