def init_lsh(self):
    """Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data."""
    # Contains the object that can be queried to find nearest neighbors at each layer.
    self.query_objects = {}
    # Mean of the training data representation per layer (subtracted before LSH).
    self.centers = {}
    for layer in self.layers:
        assert self.nb_tables >= self.neighbors

        # Normalize all the lengths, since we care about the cosine similarity.
        self.train_activations_lsh[layer] /= np.linalg.norm(
            self.train_activations_lsh[layer], axis=1).reshape(-1, 1)

        # Center the dataset and the queries: this improves the performance of LSH quite a bit.
        center = np.mean(self.train_activations_lsh[layer], axis=0)
        self.train_activations_lsh[layer] -= center
        self.centers[layer] = center

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(self.train_activations_lsh[layer][1])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = self.nb_tables
        params_cp.num_rotations = 2  # for dense data set it to 1; for sparse data set it to 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        # we build number_bits-bit hashes so that each table has 2^number_bits bins;
        # this is a good choice when 2^number_bits is of the same order of magnitude
        # as the number of data points
        falconn.compute_number_of_hash_functions(self.number_bits, params_cp)

        print('Constructing the LSH table')
        table = falconn.LSHIndex(params_cp)
        table.setup(self.train_activations_lsh[layer])

        # Parse test feature vectors and find k nearest neighbors
        query_object = table.construct_query_object()
        query_object.set_num_probes(self.nb_tables)
        self.query_objects[layer] = query_object
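# A minimal usage sketch, not part of the original code: it assumes init_lsh has already run
# and shows what a per-layer lookup might look like. The helper name `query_lsh` is hypothetical;
# it only reuses attributes built above (self.centers, self.query_objects, self.neighbors).
def query_lsh(self, layer, activation):
    # Apply the same normalization and centering used for the training activations.
    activation = activation / np.linalg.norm(activation)
    activation = activation - self.centers[layer]
    # Indices of the approximate nearest training points in this layer's table.
    return self.query_objects[layer].find_k_nearest_neighbors(activation, self.neighbors)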
def init_hash():
    # Load the training array
    train = np.array(load_all_beOne(path))
    # Number of training points
    trainNum = len(train)
    # Get default FALCONN parameters
    p = falconn.get_default_parameters(trainNum, dim)
    t = falconn.LSHIndex(p)
    dataset = [np.ravel(x[0]).astype(np.float32) for x in train]
    dataset = np.array(dataset)
    # Build the hash tables
    logging.info('Start Hash setup')
    t.setup(dataset)
    if is_pool:
        q = t.construct_query_pool()
    else:
        q = t.construct_query_object()
    return (q, train)
def __init__(self, feature_file, label_file, id_feature_file, id_label_file):
    self.idfeature = np.load(id_feature_file)
    self.idlabel = np.load(id_label_file)
    self.label = np.load(label_file)
    print("start loading feature data")
    t1 = time.time()
    feature = np.load(feature_file)
    t2 = time.time()
    print("load cost time:%f" % (t2 - t1))
    dp = fc.get_default_parameters(feature.shape[0], feature.shape[1],
                                   fc.DistanceFunction.EuclideanSquared)
    ds = fc.LSHIndex(dp)
    train_st = time.time()
    ds.setup(feature)
    train_et = time.time()
    print("train cost time:%f" % (train_et - train_st))
    self.qo = ds.construct_query_object()
def fit(self, X: np.ndarray, y: np.ndarray = None):
    """
    Setup the LSH index from training data.

    Parameters
    ----------
    X: np.array
        Data to be indexed
    y: any
        Ignored

    Returns
    -------
    self: FalconnLSH
        An instance of LSH with a built index
    """
    X = check_array(X, dtype=[np.float32, np.float64])

    if self.metric in ['euclidean', 'l2', 'minkowski']:
        self.metric = 'euclidean'
        distance = falconn.DistanceFunction.EuclideanSquared
    elif self.metric in ['squared_euclidean', 'sqeuclidean']:
        self.metric = 'sqeuclidean'
        distance = falconn.DistanceFunction.EuclideanSquared
    elif self.metric in ['cosine', 'NegativeInnerProduct', 'neg_inner']:
        self.metric = 'cosine'
        distance = falconn.DistanceFunction.NegativeInnerProduct
    else:
        warnings.warn(
            f'Invalid metric "{self.metric}". Using "euclidean" instead')
        self.metric = 'euclidean'
        distance = falconn.DistanceFunction.EuclideanSquared

    # Set up the LSH index
    lsh_construction_params = falconn.get_default_parameters(
        *X.shape, distance=distance)
    lsh_index = falconn.LSHIndex(lsh_construction_params)
    lsh_index.setup(X)

    self.X_train_ = X
    self.y_train_ = y
    self.index_ = lsh_index

    return self
def load_identifier(self, labelFile, featuresFile):
    self.label = np.load(labelFile)
    print("start loading feature data")
    print(labelFile)
    t1 = time.time()
    self.feature = np.load(featuresFile)
    self.embs = self.feature
    print("feature dtype:%s" % self.feature.dtype)
    t2 = time.time()
    print("load cost time:%f" % (t2 - t1))
    self.dp = fc.get_default_parameters(self.feature.shape[0], self.feature.shape[1],
                                        fc.DistanceFunction.EuclideanSquared)
    self.dp.l = 30
    self.ds = fc.LSHIndex(self.dp)
    train_st = time.time()
    self.ds.setup(self.feature)
    train_et = time.time()
    print("train cost time:%f" % (train_et - train_st))
    self.qo = self.ds.construct_query_object()
def setup_lsh(X, num_probes=100):
    assert X.ndim == 2
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = X.shape[1]
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 100
    params_cp.num_rotations = 1
    params_cp.seed = 1234
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(16, params_cp)
    table = falconn.LSHIndex(params_cp)
    table.setup(X)
    query_object = table.construct_query_object()
    query_object.set_num_probes(num_probes)
    return query_object
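# Illustrative usage of setup_lsh (not part of the original snippet); the data and query
# below are random placeholders and assume numpy/falconn are imported as in the function above.
if __name__ == '__main__':
    X = np.random.randn(10000, 128).astype(np.float32)
    q = np.random.randn(128).astype(np.float32)
    qo = setup_lsh(X, num_probes=200)
    # Indices into X of the 10 approximate nearest neighbors of q.
    print(qo.find_k_nearest_neighbors(q, 10))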
def _create_bucket(segments):
    """Creates a bucket of segments to use for LSH similarity lookup."""
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(segments[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 25
    params_cp.num_rotations = 2
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(18, params_cp)
    table = falconn.LSHIndex(params_cp)
    table.setup(segments)
    return (segments, table)
def test_lsh_index_positive():
    n = 1000
    d = 128
    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    dataset = np.random.randn(n, d).astype(np.float32)
    t.fit(dataset)
    u = np.random.randn(d).astype(np.float32)
    t.find_k_nearest_neighbors(u, 10)
    t.find_near_neighbors(u, 10.0)
    t.find_nearest_neighbor(u)
    t.get_candidates_with_duplicates(u)
    t.get_max_num_candidates()
    t.get_num_probes()
    t.get_query_statistics()
    t.get_unique_candidates(u)
    t.get_unique_sorted_candidates(u)
    t.reset_query_statistics()
    t.set_max_num_candidates(100)
    t.set_num_probes(10)
def lsh_sieve(full_deltas, d, n):
    deltas = np.reshape(full_deltas, (n, d))
    centred_deltas = deltas - np.mean(deltas, axis=0)

    params = falconn.get_default_parameters(n, d)
    fln = falconn.LSHIndex(params)
    fln.setup(centred_deltas)
    qob = fln.construct_query_object()

    # Greedy merge within a distance
    full_grad = np.zeros(d)
    for i in range(n):
        neighbors = qob.find_near_neighbors(centred_deltas[i], 1.0 / d)
        full_grad = full_grad + (deltas[i] / len(neighbors))

    return full_grad
def fit(self, X):
    if X.dtype != numpy.float32:
        X = X.astype(numpy.float32)
    if self._metric == 'angular':
        X /= numpy.linalg.norm(X, axis=1).reshape(-1, 1)
    self._center = numpy.mean(X, axis=0)
    X -= self._center
    import falconn
    self._params = falconn.LSHConstructionParameters()
    self._params.dimension = X.shape[1]
    self._params.distance_function = 'euclidean_squared'
    self._params.lsh_family = 'cross_polytope'
    falconn.compute_number_of_hash_functions(self._num_bits, self._params)
    self._params.l = self._num_tables
    self._params.num_rotations = 1
    self._params.num_setup_threads = 0
    self._params.storage_hash_table = 'flat_hash_table'
    self._params.seed = 95225714
    self._index = falconn.LSHIndex(self._params)
    self._index.setup(X)
    self._index.set_num_probes(self._num_probes)
    self._buf = numpy.zeros((X.shape[1],), dtype=numpy.float32)
def setup_second_layer(self, number_of_tables=50):
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = self.X.shape[1] + 1
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(15, params_cp)

    print('Constructing the LSH table')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    self.X_ = self.X_.astype('float')
    table.setup(self.X_)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    self.query_object = table.construct_query_object()
def init_falconn():
    dim = 2048
    # Load the feature arrays
    my_feature = np.load(
        os.path.join(model_path, 'tensorflow-feature.npy'))
    print(my_feature.shape)
    my_class_name = np.load(
        os.path.join(model_path, 'tensorflow-class_name.npy'))
    print(my_class_name.shape)
    my_file_path = np.load(
        os.path.join(model_path, 'tensorflow-file_path.npy'))
    print(my_file_path.shape)
    # Number of training points
    trainNum = len(my_feature)
    # Get default FALCONN parameters
    p = falconn.get_default_parameters(trainNum, dim)
    t = falconn.LSHIndex(p)
    dataset = my_feature
    # Build the hash tables
    t.setup(dataset)
    q = t.construct_query_pool()
    return my_feature, my_class_name, my_file_path, q
def setup_lsh():
    # Extract the signature matrix from the database
    con = psycopg2.connect("dbname=yinhan user=yinhan")
    cur = con.cursor()
    cur.execute("SELECT SIGNATURE FROM AKAFINGER")
    lst = cur.fetchall()
    con.commit()
    con.close()

    data = np.array([val[0] for val in lst])
    # Center the dataset; the same center is later used to center query snippets,
    # which reportedly improves LSH performance.
    center = np.mean(data, axis=0)
    data = data - center

    params_cp = falconn.get_default_parameters(num_points=data.shape[0],
                                               dimension=data.shape[1])
    table = falconn.LSHIndex(params_cp)
    table.setup(data)
    return center, table.construct_query_object()
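# Illustrative follow-up (not in the original): how the (center, query_object) pair returned
# by setup_lsh might be used to look up a fingerprint. `snippet` is a hypothetical signature
# vector with the same dimensionality as the database signatures.
center, query_object = setup_lsh()
snippet = np.zeros(center.shape[0], dtype=center.dtype)  # placeholder query signature
# Center the query with the database mean, then fetch the 5 closest signatures.
print(query_object.find_k_nearest_neighbors(snippet - center, 5))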
def init_hash():
    global my_arr, my_id, big_class
    # Load the arrays
    my_arr = np.load(os.path.join(path, 'array.npy'))
    my_id = np.load(os.path.join(path, 'id.npy'))
    f = open(os.path.join(path, 'big_class.txt'), 'r')
    a = f.read()
    big_class = eval(a)
    f.close()
    # Number of training points
    trainNum = len(my_arr)
    # Get default FALCONN parameters
    p = falconn.get_default_parameters(trainNum, dim)
    t = falconn.LSHIndex(p)
    dataset = my_arr
    # Build the hash tables
    logging.info('Start Hash setup')
    t.setup(dataset)
    if is_pool:
        q = t.construct_query_pool()
    else:
        q = t.construct_query_object()
    return q
def __falconn_fit(self):
    """Initializes locality-sensitive hashing with FALCONN to find nearest neighbors in training data."""
    import falconn

    dimension = self.features.shape[1]
    nb_tables = self.kwargs['nb_tables']
    number_bits = self.kwargs['number_bits']

    # LSH parameters
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = dimension
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = nb_tables
    params_cp.num_rotations = 2  # for dense data set it to 1; for sparse data set it to 2
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build number_bits-bit hashes so that each table has 2^number_bits bins;
    # a rule of thumb is to have the number of bins be of the same order of
    # magnitude as the number of data points
    falconn.compute_number_of_hash_functions(number_bits, params_cp)

    self._falconn_table = falconn.LSHIndex(params_cp)
    self._falconn_query_object = None
    self._FALCONN_NB_TABLES = nb_tables

    # Center the dataset and the queries: this improves the performance of LSH quite a bit.
    self.center = np.mean(self.features, axis=0)
    self.features -= self.center

    # add features to falconn table
    self._falconn_table.setup(self.features)
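# A hedged sketch of the matching query path (not part of the original snippet): since
# __falconn_fit leaves _falconn_query_object as None, a query method would typically
# construct it lazily, re-center the queries with self.center, and probe the table.
def __falconn_query(self, queries, neighbors):
    if self._falconn_query_object is None:
        self._falconn_query_object = self._falconn_table.construct_query_object()
        self._falconn_query_object.set_num_probes(self._FALCONN_NB_TABLES)
    # Apply the same centering that was applied to the training features.
    queries = queries - self.center
    return [self._falconn_query_object.find_k_nearest_neighbors(q, neighbors) for q in queries]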
def build_lsh(self, all_signatures):
    """
    Takes the signatures of songs and builds an LSH table plus its query object.

    params:
        all_signatures: all signatures from the database
    returns:
        a falconn hash table (a pointer to the falconn hash table);
        None if not successful
    """
    if all_signatures.shape[0] == 0:
        raise ValueError("All signatures must not be empty.")

    params = falconn.get_default_parameters(all_signatures.shape[0],
                                            all_signatures.shape[1])
    # Center the dataset to improve performance:
    all_signatures -= np.mean(all_signatures, axis=0)

    # Create the LSH table
    print('Constructing the LSH table...')
    table = falconn.LSHIndex(params)
    table.setup(all_signatures)

    print('Constructing the queries...')
    query_object = table.construct_query_object()

    self.table = table
    self.query_object = query_object
    if not table or not query_object:
        return None
def search(dataset, query, number):
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 50
    # we set one rotation, since the data is dense enough;
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build 18-bit hashes so that each table has 2^18 bins; this is a good
    # choice since 2^18 is of the same order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(18, params_cp)

    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)

    query_object = table.construct_query_object()
    number_of_probes = 30000
    query_object.set_num_probes(number_of_probes)

    result = query_object.find_k_nearest_neighbors(query, number)
    return result
def fit(self, X):
    if X.dtype != numpy.float32:
        X = X.astype(numpy.float32)
    if self._metric == 'hamming':
        # replace all zeroes by -1
        X[X < 0.5] = -1
    if self._metric == 'angular' or self._metric == 'hamming':
        X /= numpy.linalg.norm(X, axis=1).reshape(-1, 1)
    self._center = numpy.mean(X, axis=0)
    X -= self._center
    self._params = falconn.LSHConstructionParameters()
    self._params.dimension = X.shape[1]
    self._params.distance_function = falconn.DistanceFunction.EuclideanSquared
    self._params.lsh_family = falconn.LSHFamily.CrossPolytope
    falconn.compute_number_of_hash_functions(self._num_bits, self._params)
    self._params.l = self._num_tables
    self._params.num_rotations = 1
    self._params.num_setup_threads = 0
    self._params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    self._params.seed = 95225714
    self._index = falconn.LSHIndex(self._params)
    self._index.setup(X)
    self._query_object = self._index.construct_query_object()
    self._query_object.set_num_probes(self._num_probes)
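# Hypothetical companion to the fit() above (not from the original snippet): queries must be
# preprocessed the same way as the indexed data (cast to float32, -1/+1 remap for 'hamming',
# normalization for 'angular'/'hamming', and re-centering) before probing the query object.
def query(self, v, n):
    if v.dtype != numpy.float32:
        v = v.astype(numpy.float32)
    if self._metric == 'hamming':
        v[v < 0.5] = -1
    if self._metric == 'angular' or self._metric == 'hamming':
        v /= numpy.linalg.norm(v)
    v -= self._center
    return self._query_object.find_k_nearest_neighbors(v, n)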
#
# Author : fcbruce <*****@*****.**>
#
# Time : Sat 06 May 2017 17:10:14
#

import numpy as np
import falconn as fa

a = np.random.randn(50000, 500)
a /= np.linalg.norm(a, axis=1).reshape(-1, 1)

print("pending...")

params_cp = fa.LSHConstructionParameters()
params_cp.dimension = 500
params_cp.lsh_family = 'cross_polytope'
params_cp.distance_function = 'euclidean_squared'
params_cp.l = 7
params_cp.num_rotations = 1
params_cp.seed = 11111
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = 'bit_packed_flat_hash_table'

fa.compute_number_of_hash_functions(18, params_cp)

table = fa.LSHIndex(params_cp)
table.setup(a)

print("find")
print(table.find_nearest_neighbor(a[1]))
def getLshIndex(para, dataset):
    nnModel = falconn.LSHIndex(para)
    nnModel.setup(dataset)
    print("## sim falconn data setup done. data", dataset.shape, time.asctime())
    return nnModel
print('Average query time: {} seconds'.format(average_scan_time))
print(sepline)

# Hyperplane hashing
params_hp = falconn.LSHConstructionParameters()
params_hp.dimension = d
params_hp.lsh_family = 'hyperplane'
params_hp.distance_function = 'negative_inner_product'
params_hp.k = 19
params_hp.l = 10
params_hp.seed = seed ^ 833840234

print('Hyperplane hash\n')

start = timeit.default_timer()
hp_table = falconn.LSHIndex(params_hp)
hp_table.fit(data)
hp_table.set_num_probes(2464)
stop = timeit.default_timer()
hp_construction_time = stop - start

print('k = {}'.format(params_hp.k))
print('l = {}'.format(params_hp.l))
print('Number of probes = {}'.format(hp_table.get_num_probes()))
print('Construction time: {} seconds\n'.format(hp_construction_time))

hp_avg_time, hp_success_prob = run_experiment(hp_table, queries, true_nns)
del hp_table
print(sepline)

# Cross polytope hashing
def test_lsh_index_negative():
    n = 1000
    d = 128
    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    try:
        t.find_nearest_neighbor(np.random.randn(d))
        assert False
    except RuntimeError:
        pass
    try:
        dataset = [[1.0, 2.0], [3.0, 4.0]]
        t.fit(dataset)
        assert False
    except TypeError:
        pass
    try:
        dataset = np.random.randn(n, d).astype(np.int32)
        t.fit(dataset)
        assert False
    except ValueError:
        pass
    try:
        dataset = np.random.randn(10, 10, 10)
        t.fit(dataset)
        assert False
    except ValueError:
        pass
    dataset = np.random.randn(n, d).astype(np.float32)
    t.fit(dataset)
    dataset = np.random.randn(n, d).astype(np.float64)
    t.fit(dataset)
    u = np.random.randn(d).astype(np.float64)
    try:
        t.find_k_nearest_neighbors(u, 0.5)
        assert False
    except TypeError:
        pass
    try:
        t.find_k_nearest_neighbors(u, -1)
        assert False
    except ValueError:
        pass
    try:
        t.find_near_neighbors(u, -1)
        assert False
    except ValueError:
        pass
    try:
        t.set_max_num_candidates(0.5)
        assert False
    except TypeError:
        pass
    try:
        t.set_max_num_candidates(-10)
        assert False
    except ValueError:
        pass
    t.set_num_probes(t._params.l)
    try:
        t.set_num_probes(t._params.l - 1)
        assert False
    except ValueError:
        pass
    try:
        t.set_num_probes(1000.1)
        assert False
    except TypeError:
        pass

    def check_check_query(f):
        try:
            f(u.astype(np.float32))
            assert False
        except ValueError:
            pass
        try:
            f([0.0] * d)
            assert False
        except TypeError:
            pass
        try:
            f(u[:d - 1])
            assert False
        except ValueError:
            pass
        try:
            f(np.random.randn(d, d))
            assert False
        except ValueError:
            pass

    check_check_query(lambda u: t.find_k_nearest_neighbors(u, 10))
    check_check_query(lambda u: t.find_near_neighbors(u, 0.5))
    check_check_query(lambda u: t.find_nearest_neighbor(u))
    check_check_query(lambda u: t.get_candidates_with_duplicates(u))
    check_check_query(lambda u: t.get_unique_candidates(u))
    check_check_query(lambda u: t.get_unique_sorted_candidates(u))
    t.find_near_neighbors(u, 0.0)
def generate_candidate_threshold(entity_embedding=None, data_ids="OpenEA", path="", threshold=0.2,
                                 output_path=False, entity_file="ent_embeds.npy", normalize=True,
                                 metric="euclidean", lsh_family="hyperplane", number_of_tables=500):
    """
    :param entity_embedding:
    :param data_ids:
    :param path:
    :param threshold:
    :param output_path:
    :param entity_file:
    :param normalize:
    :param metric: 1. "inner": inner product between vectors;
                   2. "euclidean": Euclidean distance (after l2 normalization it is
                      proportional to the cosine distance).
    :param lsh_family:
    :return:
    """
    if entity_embedding is None:
        entity_file_path = path + entity_file
        entity_embedding = np.load(entity_file_path)
        print("Load [%s] successfully!" % entity_file_path)

    if data_ids == "OpenEA":
        ent2id1, id2ent1, max_id = read_ent_id(path + "kg1_ent_ids")
        ent2id2, id2ent2, max_id = read_ent_id(path + "kg2_ent_ids")
        paths = path.split('/')
        test_path = "/".join([paths[1], paths[2], paths[3], "datasets",
                              paths[7], paths[8], paths[9]])
        test_ids = []
        with open('/' + test_path + r"/test_links", 'r', encoding='utf-8') as f:
            for line in f.readlines():
                items = line.strip().split("\t")
                id1, id2 = int(ent2id1[items[0]]), int(ent2id2[items[1]])
                test_ids.append([id1, id2])
        data_ids = test_ids

    if data_ids == "dbp15k":
        # train_ids = read_ids(path + "sup_ent_ids")
        # Only matches on the test set are considered.
        test_ids = read_ids(path + "ref_ent_ids")
        # test_ids.extend(train_ids)
        data_ids = test_ids

    data_ids = np.array(data_ids).astype(int)
    entity_embedding = entity_embedding.astype(np.float32)
    if metric == "euclidean":
        entity_embedding -= np.mean(entity_embedding, axis=0)

    Lvec = np.array([entity_embedding[e] for e in data_ids[:, 0]])
    Rvec = np.array([entity_embedding[e] for e in data_ids[:, 1]])

    if os.path.exists(path + "mapping_mat.npy"):
        # Final embeddings after the OpenEA mapping transformation.
        mapping = np.load(path + "mapping_mat.npy")
        Lvec = np.matmul(Lvec, mapping)

    if normalize:
        Lvec = preprocessing.normalize(Lvec, norm="l2", axis=1)
        Rvec = preprocessing.normalize(Rvec, norm="l2", axis=1)

    seed = 119417657
    L_True = data_ids[:, 0].tolist()
    print("shape:", entity_embedding.shape)

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = entity_embedding.shape[1]
    if lsh_family == "crosspolytope":
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    elif lsh_family == "hyperplane":
        params_cp.lsh_family = falconn.LSHFamily.Hyperplane
    if metric == "euclidean":
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    elif metric == "inner":
        params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = seed
    params_cp.num_setup_threads = 2
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # we build 20-bit hashes so that each table has 2^20 bins; this is a good
    # choice when 2^20 is of the same order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(20, params_cp)

    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(Lvec)
    t2 = timeit.default_timer()
    print('Construction time: {}'.format(t2 - t1))

    query_object = table.construct_query_object()
    number_of_probes = number_of_tables
    print('Choosing number of probes: ', number_of_probes)
    query_object.set_num_probes(number_of_probes)

    t1 = timeit.default_timer()
    true_cnt = 0
    total = 0
    true_all = data_ids.shape[0]
    node_pairs = []
    print("Metric:", metric, "Threshold:", threshold)
    for ids_index, pair in enumerate(data_ids):
        ans = query_object.find_near_neighbors(Rvec[ids_index], threshold=threshold)
        for index in range(len(ans)):
            if pair[0] == L_True[ans[index]]:
                true_cnt += 1
                node_pairs.append((pair[0], pair[1], 1))
            else:
                node_pairs.append((L_True[ans[index]], pair[1], 0))
        total += len(ans)
    print('Threshold:[%f] True cnt:[%d] Generate All cnt:[%d] Total:[%d] Recall:[%f] P/E ratio:[%f] Metric:[%s]'
          % (threshold, true_cnt, total, true_all, true_cnt / true_all, total / true_all, metric))
    t2 = timeit.default_timer()
    print('Generate Candidate time: {}'.format(t2 - t1))

    if output_path == True:
        output_path = "/".join(path.split('/')[:-1]) + '/topk_' + str(threshold) + '_name_ngram'
        print('output path:', output_path)
        with open(output_path, 'w', encoding='utf8') as f:
            for pair in node_pairs:
                f.writelines(str(pair[0]) + '\t' + str(pair[1]) + '\t' + str(pair[2]) + '\n')
import numpy as np
import falconn

if __name__ == '__main__':
    a1 = np.load('outputs_1.npy')
    a2 = np.load('outputs_2.npy')
    y = np.load('labels.npy')
    print(y.shape)
    a = np.r_[a1, a2]
    n, d = a.shape
    p = falconn.get_default_parameters(n, d)
    t = falconn.LSHIndex(p)
    dataset = a
    t.setup(dataset)
    Q = t.construct_query_object()
    # input
    i, k = 4545, 100
    print(i, k)
    while True:
        i, k = map(int, input().split())
        q = a[i:i + 1, :]
        u = q.sum(axis=0)
        ans = Q.find_k_nearest_neighbors(u, k)
        print(ans)
import falconn

# In[ ]:

parameters = falconn.LSHConstructionParameters()
num_tables = 1
parameters.l = num_tables
parameters.dimension = num_dimensions
parameters.distance_function = falconn.DistanceFunction.EuclideanSquared
parameters.lsh_family = falconn.LSHFamily.CrossPolytope
parameters.num_rotations = 1
parameters.num_setup_threads = 1
parameters.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

falconn.compute_number_of_hash_functions(16, parameters)

index = falconn.LSHIndex(parameters)
get_ipython().run_line_magic('time', 'index.setup(dataset)')

query_object = index.construct_query_object()
num_probes = 1
query_object.set_num_probes(num_probes)

get_ipython().run_line_magic(
    'timeit', 'query_object.find_k_nearest_neighbors(query, 5)')

# In[ ]:

query = dataset[5000]
print(query_object.find_k_nearest_neighbors(query, 5))
def set_clustering_LSH_Index(self, number_of_queries, query_accuracy, number_of_tables, hash_bit):
    # Function definition: returns the LSH index -- read the LSH notes or README.2
    # for more information.
    #
    # Parameters
    # number_of_queries: the number of queries used to determine number_of_probes
    # query_accuracy: specifies the level of accuracy of the index;
    #     setting query_accuracy = 1 degenerates the LSH index into linear search.
    # number_of_tables: the number of hash tables used for a given nearest neighbor search
    # hash_bit: used to determine the number of hash functions; see README for details.
    print("Setting Clustering Index")

    queries = self.w2v_vectors[(len(self.w2v_vectors) - number_of_queries):]
    w2v_vectors = self.w2v_vectors[:(len(self.w2v_vectors) - number_of_queries)]

    # Center the vectors
    center = np.mean(w2v_vectors, axis=0)
    w2v_vectors -= center
    queries -= center

    # Perform a linear scan to obtain the ground-truth answers
    answers = self.linearScan_answerGenerator(w2v_vectors, queries)

    # Set number of probes
    print('Choosing number of probes')
    init_number_of_probes = 600

    # Parameters
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(w2v_vectors[0])  # = 50 for Glove6B.50d
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(hash_bit, params_cp)

    # Constructing the LSH index
    print('Constructing the LSH Index')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(w2v_vectors)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    query_object = table.construct_query_object()
    number_of_probes = self.probeGenerator(query_accuracy, init_number_of_probes,
                                           query_object, answers, queries, number_of_tables)
    query_object.set_num_probes(number_of_probes)

    # Performance statistics
    t1 = timeit.default_timer()
    score = 0
    for (i, query) in enumerate(queries):
        if query_object.find_nearest_neighbor(query) == answers[i]:
            score += 1
    t2 = timeit.default_timer()
    print('Query time: {}'.format((t2 - t1) / len(queries)))
    print('Precision: {}'.format(float(score) / len(queries)))

    self.query_object = query_object
    print("Vectors Successfully Hashed. Clustering LSH Index Created")
params_cp.l = number_of_tables
# we set one rotation, since the data is dense enough;
# for sparse data set it to 2
params_cp.num_rotations = 1
params_cp.seed = 5721840
# we want to use all the available threads to set up
params_cp.num_setup_threads = 0
params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
# we build 18-bit hashes so that each table has 2^18 bins; this is a good
# choice since 2^18 is of the same order of magnitude as the number of data points
falconn.compute_number_of_hash_functions(18, params_cp)

print('Constructing the LSH table')
t1 = timeit.default_timer()
table = falconn.LSHIndex(params_cp)
table.setup(dataset)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

# find the smallest number of probes to achieve accuracy 0.9
# using binary search
print('Choosing number of probes')
number_of_probes = number_of_tables


def evaluate_number_of_probes(number_of_probes):
    table.set_num_probes(number_of_probes)
    score = 0
    for (i, query) in enumerate(queries):
        if answers[i] in table.get_candidates_with_duplicates(query):
            score += 1
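# A sketch (not in the original excerpt) of how the binary search announced above might start.
# It assumes evaluate_number_of_probes goes on to return `score`, the number of queries whose
# true answer appears among the candidates; 0.9 * len(queries) is the target accuracy.
while evaluate_number_of_probes(number_of_probes) < 0.9 * len(queries):
    number_of_probes *= 2
# number_of_probes is now sufficient while number_of_probes // 2 (if above number_of_tables)
# is not; bisecting between the two yields the smallest sufficient value.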
def test_lsh_index_negative():
    p = falconn.get_default_parameters(n, d)
    try:
        t = falconn.LSHIndex(p)
        t.construct_query_object()
        assert False
    except RuntimeError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup([[1.0, 2.0], [3.0, 4.0]])
        assert False
    except TypeError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup(np.random.randn(n, d).astype(np.int32))
        assert False
    except TypeError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup(np.random.randn(10, 10, 10))
        assert False
    except ValueError:
        pass
    try:
        t = falconn.LSHIndex(p)
        t.setup(np.random.randn(n, d))
        t.setup(np.random.randn(n, d))
        assert False
    except RuntimeError:
        pass
    for (t1, t2) in [(np.float32, np.float64), (np.float64, np.float32)]:
        for g in [
                lambda t: t.construct_query_object(),
                lambda t: t.construct_query_pool()
        ]:
            t = falconn.LSHIndex(p)
            t.setup(np.random.randn(n, d).astype(t1))
            q = g(t)
            u = np.random.randn(d).astype(t1)
            try:
                q.find_k_nearest_neighbors(u, 0.5)
                assert False
            except TypeError:
                pass
            try:
                q.find_k_nearest_neighbors(u, -1)
                assert False
            except ValueError:
                pass
            try:
                q.find_near_neighbors(u, -1)
                assert False
            except ValueError:
                pass
            try:
                q.set_max_num_candidates(0.5)
                assert False
            except TypeError:
                pass
            try:
                q.set_max_num_candidates(-10)
                assert False
            except ValueError:
                pass
            q.set_num_probes(t._params.l)
            try:
                q.set_num_probes(t._params.l - 1)
                assert False
            except ValueError:
                pass
            try:
                q.set_num_probes(1000.1)
                assert False
            except TypeError:
                pass

            def check_check_query(f):
                try:
                    f(u.astype(t2))
                    assert False
                except TypeError:
                    pass
                try:
                    f([0.0] * d)
                    assert False
                except TypeError:
                    pass
                try:
                    f(u[:d - 1])
                    assert False
                except ValueError:
                    pass
                try:
                    f(np.random.randn(d, d))
                    assert False
                except ValueError:
                    pass

            check_check_query(lambda u: q.find_k_nearest_neighbors(u, 10))
            check_check_query(lambda u: q.find_near_neighbors(u, 0.5))
            check_check_query(lambda u: q.find_nearest_neighbor(u))
            check_check_query(lambda u: q.get_candidates_with_duplicates(u))
            check_check_query(lambda u: q.get_unique_candidates(u))
import falconn

par = falconn.LSHConstructionParameters()
param = falconn.get_default_parameters(num_points=len(train),
                                       dimension=len(train[0]),
                                       distance=falconn.DistanceFunction.EuclideanSquared)
print(param.lsh_family, param.l, param.k)
tables = param.l
hashes = param.k
param.l = int(1.1 * tables)

para = []
for k in [hashes, int(hashes * 1.5)]:
    param.k = k
    lsh = falconn.LSHIndex(param)
    lsh.setup(train)

    startClock = time.clock()
    startTime = process_time()
    indexlsh = lsh.construct_query_object()
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.clock()
    constructionClock = endClock - startClock

    for t in [param.l, int(param.l * 2), int(param.l * 3)]:
        indexlsh.set_num_probes(t)
        print('lsh-l' + str(param.l) + 'k' + str(param.k) + 't' + str(t))
        rez = []
        for q in qry:
            startClock = time.clock()
            startTime = process_time()
#
params_cp_blue = falconn.LSHConstructionParameters()
params_cp_blue.dimension = len(dataset_blue[0])
params_cp_blue.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp_blue.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp_blue.l = number_of_tables
params_cp_blue.num_rotations = 1
params_cp_blue.seed = 666666
params_cp_blue.num_setup_threads = 1
params_cp_blue.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
falconn.compute_number_of_hash_functions(20, params_cp_blue)

print('Constructing the LSH table')
t1 = timeit.default_timer()
table_blue = falconn.LSHIndex(params_cp_blue)
table_blue.setup(dataset_blue)
t2 = timeit.default_timer()
query_object_blue = table_blue.construct_query_object()
print('Done')
print('Construction time: {}'.format(t2 - t1))

params_cp_green = falconn.LSHConstructionParameters()
params_cp_green.dimension = len(dataset_green[0])
params_cp_green.lsh_family = falconn.LSHFamily.CrossPolytope
params_cp_green.distance_function = falconn.DistanceFunction.EuclideanSquared
params_cp_green.l = number_of_tables
params_cp_green.num_rotations = 1
params_cp_green.seed = 666666
params_cp_green.num_setup_threads = 1
params_cp_green.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable