def make_tables(dataset, num_queries=1000, num_tables=50, copy=True, seed=None,
                num_threads=0, verbose=True, already_normed=False):
    """Build a FALCONN cross-polytope index over normalised, centred rows.

    Rows are scaled to unit L2 norm (skipped when ``already_normed``), the
    column mean of the result is subtracted, and the index is built on that.

    Args:
        dataset: 2-D array, one point per row.
        num_queries: Accepted but not used by this function.
        num_tables: Number of hash tables (``l``).
        copy: If true, work on a copy; otherwise ``dataset`` is scaled and
            centred in place.
        seed: Seed for the hash functions; drawn with ``np.random.randint``
            when ``None``.
        num_threads: Passed through as ``num_setup_threads``.
        verbose: If true, progress messages are printed to stderr.
        already_normed: If true, the rows are taken to be unit-norm already.

    Returns:
        ``(table, normed_mean)``: the set-up ``falconn.LSHIndex`` and the mean
        vector that was subtracted (presumably so callers can shift queries
        the same way).
    """
    if verbose:
        log = partial(print, file=sys.stderr)
    else:
        log = lambda *a, **kw: None

    if not already_normed:
        row_norms = np.linalg.norm(dataset, axis=1)
        if copy:
            dataset = dataset / row_norms[:, np.newaxis]
        else:
            dataset /= row_norms[:, np.newaxis]
    elif copy:
        dataset = dataset.copy()

    normed_mean = dataset.mean(axis=0)
    dataset -= normed_mean

    if seed is None:
        seed = np.random.randint(2**31)

    config = falconn.LSHConstructionParameters()
    config.dimension = dataset.shape[1]
    config.lsh_family = 'cross_polytope'
    config.distance_function = 'euclidean_squared'
    config.l = num_tables
    config.num_rotations = 1  # try 2, maybe
    config.seed = seed
    config.num_setup_threads = num_threads
    config.storage_hash_table = 'bit_packed_flat_hash_table'

    # Hash width is log2 of the number of points, rounded to the nearest int.
    hash_bits = int(np.round(np.log2(dataset.shape[0])))
    falconn.compute_number_of_hash_functions(hash_bits, config)

    log('Starting building table...', end='')
    index = falconn.LSHIndex(config)
    index.setup(dataset)
    log('done')

    return index, normed_mean
def LSHtable(self, file, euclidean=True, number_of_tables=50, hash_fx=18):
    """Build a FALCONN cross-polytope LSH index over a matrix of vectors.

    Args:
        file: 2-D numpy array of document vectors, one row per document
            (an in-memory array, not a file path).
        euclidean: If truthy, index with ``EuclideanSquared`` distance;
            otherwise with ``NegativeInnerProduct``.
        number_of_tables: Number of hash tables (``l``).
        hash_fx: Number of hash bits handed to
            ``falconn.compute_number_of_hash_functions``.

    Returns:
        The ``falconn.LSHIndex`` after ``setup``. The number of rotations
        (1), the seed (5721840) and the setup thread count (0) are fixed
        inside this method and are not parameters.
    """
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(file[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    if euclidean:
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    else:
        params_cp.distance_function = (
            falconn.DistanceFunction.NegativeInnerProduct)
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(hash_fx, params_cp)

    # Construct the LSH index; the local is deliberately not named after the
    # method to avoid shadowing it.
    index = falconn.LSHIndex(params_cp)
    index.setup(file)
    return index
def set_cp(data):
    """Index ``data`` with FALCONN default parameters; return a query object.

    The defaults come from ``falconn.get_default_parameters`` for the shape
    of ``data``; the hash-function count is then recomputed for 7 bits and
    the query object is configured to use 896 probes.

    Args:
        data: 2-D array, one point per row (``data.shape`` must unpack into
            exactly two values).

    Returns:
        The query object built from the index.
    """
    n_rows, n_cols = data.shape
    config = falconn.get_default_parameters(n_rows, n_cols)
    falconn.compute_number_of_hash_functions(7, config)

    index = falconn.LSHIndex(config)
    index.setup(data)

    query_object = index.construct_query_object()
    query_object.set_num_probes(896)
    return query_object
def retrival(self, query, dataset=None, *, k=None, threshold=None):
    """Look up neighbours of ``query`` in a cached per-dataset LSH index.

    Indexes are cached in ``self.tables`` under an xxh64 hash of the
    dataset's first column, so datasets sharing a first column map to the
    same cached index. Each cache entry is ``(mean, query_object)``; the mean
    is subtracted from the query before the lookup.

    Args:
        query: Query vector. It is not modified; a centred copy is used.
        dataset: 2-D array to search. ``None`` reuses the index of the
            previous call (``self.last_table``).
        k: If given, return the ``k`` nearest neighbours.
        threshold: If given, return all neighbours within this threshold.

    Returns:
        The result of ``find_k_nearest_neighbors``, ``find_near_neighbors``
        or, when neither ``k`` nor ``threshold`` is given,
        ``find_nearest_neighbor``.

    Raises:
        ValueError: If both ``k`` and ``threshold`` are given.
        Exception: If no dataset was given and there is no previous index.
    """
    # Reject contradictory arguments before doing any (expensive) work.
    if k is not None and threshold is not None:
        raise ValueError("k and threshold should not pass simultaneously")

    if dataset is None:
        table = self.last_table
    else:
        hashint = xxhash.xxh64(dataset[:, 0].copy(), self.seed).intdigest()
        table = self.tables.get(hashint)
        if table is None:
            print('find a new dataset')
            # astype() copies, so centring below leaves the caller's dataset
            # untouched.
            dataset = dataset.astype(np.float32)
            mean = np.mean(dataset, axis=0)
            dataset -= mean
            params = falconn.get_default_parameters(
                dataset.shape[0], dataset.shape[1])
            falconn.compute_number_of_hash_functions(7, params)
            lsh_index = falconn.LSHIndex(params)
            lsh_index.setup(dataset)
            qtable = lsh_index.construct_query_object()
            qtable.set_num_probes(10000)
            table = (mean, qtable)
            self.tables[hashint] = table

    if table is None:
        raise Exception("Dataset not specific")

    mean, query_object = table
    # Centre a copy: subtracting in place would shift the caller's array
    # again on every reuse of the same query.
    centered = query - mean
    self.last_table = table

    if k is not None:
        return query_object.find_k_nearest_neighbors(centered, k)
    if threshold is not None:
        return query_object.find_near_neighbors(centered, threshold)
    return query_object.find_nearest_neighbor(centered)
def search(query, number,
           dataset_path="/Users/liupengcheng/Downloads/final_data.npy",
           number_of_probes=3816):
    """Return the ``number`` approximate nearest neighbours of ``query``.

    The dataset is loaded from ``dataset_path`` and a FALCONN cross-polytope
    index is built from scratch on every call, so each call pays the full
    load and construction cost.

    Args:
        query: Query vector with the same dimension as the dataset rows.
        number: How many neighbours to return.
        dataset_path: ``.npy`` file holding the 2-D dataset. Defaults to the
            path that used to be hard-coded here.
        number_of_probes: Probes per query. Defaults to the value that used
            to be hard-coded here.

    Returns:
        The result of ``find_k_nearest_neighbors(query, number)``.
    """
    dataset = np.load(dataset_path)

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = 50
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    # we build 18-bit hashes so that each table has 2^18 bins; this is a
    # good choice when 2^18 is of the same order of magnitude as the number
    # of data points
    falconn.compute_number_of_hash_functions(18, params_cp)

    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)

    query_object = table.construct_query_object()
    query_object.set_num_probes(number_of_probes)
    return query_object.find_k_nearest_neighbors(query, number)
def test_number_of_hash_functions():
    """Check the ``k`` / ``last_cp_dimension`` chosen for several bit budgets."""
    params = falconn._internal.LSHConstructionParameters()

    def configure(family, bits):
        # Select the hash family, then let FALCONN derive the number of hash
        # functions for the requested number of bits.
        params.lsh_family = family
        falconn.compute_number_of_hash_functions(bits, params)

    # Dimension 10.
    params.dimension = 10
    configure('hyperplane', 5)
    assert params.k == 5

    configure('cross_polytope', 5)
    assert params.k == 1
    assert params.last_cp_dimension == 16

    # Dimension 100.
    params.dimension = 100
    configure('hyperplane', 8)
    assert params.k == 8

    configure('cross_polytope', 8)
    assert params.k == 1
    assert params.last_cp_dimension == 128

    configure('cross_polytope', 10)
    assert params.k == 2
    assert params.last_cp_dimension == 2
def __init__(self, dataset):
    """Build a FALCONN cross-polytope index over ``dataset``.

    Stores the construction parameters (``self.params_cp``), the index
    (``self.table``), the dataset itself (``self.data``) and a query object
    (``self.query_object``) on the instance.

    Args:
        dataset: Indexable collection of equal-length vectors; the dimension
            is taken from ``len(dataset[0])``.
    """
    # we build only 50 tables, increasing this quantity will improve the
    # query time at a cost of slower preprocessing and larger memory
    # footprint, feel free to play with this number
    number_of_tables = 50

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    self.params_cp = params_cp
    # we build 18-bit hashes so that each table has 2^18 bins; this is a
    # good choice when 2^18 is of the same order of magnitude as the number
    # of data points
    falconn.compute_number_of_hash_functions(18, params_cp)

    print('Constructing the LSH table')
    self.table = falconn.LSHIndex(params_cp)
    self.table.setup(dataset)
    self.data = dataset
    self.query_object = self.table.construct_query_object()
def make_tables(dataset, num_queries=1000, num_tables=50, copy=True, seed=None,
                num_threads=0, verbose=True):
    """Build a FALCONN cross-polytope index over unit-norm, centred rows.

    Every row is divided by its L2 norm, the column mean of the result is
    subtracted, and the index is built on that.

    Args:
        dataset: 2-D array, one point per row.
        num_queries: Accepted but not used by this function.
        num_tables: Number of hash tables (``l``).
        copy: If true, the scaling produces a new array; otherwise
            ``dataset`` is scaled and centred in place.
        seed: Seed for the hash functions; drawn with ``np.random.randint``
            when ``None``.
        num_threads: Passed through as ``num_setup_threads``.
        verbose: If true, progress messages are printed to stderr.

    Returns:
        ``(table, normed_mean)``: the set-up ``falconn.LSHIndex`` and the
        mean vector that was subtracted.
    """
    if verbose:
        report = partial(print, file=sys.stderr)
    else:
        report = lambda *a, **kw: None

    scale = np.linalg.norm(dataset, axis=1)[:, np.newaxis]
    if copy:
        dataset = dataset / scale
    else:
        dataset /= scale

    normed_mean = dataset.mean(axis=0)
    dataset -= normed_mean

    if seed is None:
        seed = np.random.randint(2**31)

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = dataset.shape[1]
    lsh_params.lsh_family = 'cross_polytope'
    lsh_params.distance_function = 'euclidean_squared'
    lsh_params.l = num_tables
    lsh_params.num_rotations = 1  # try 2, maybe
    lsh_params.seed = seed
    lsh_params.num_setup_threads = num_threads
    lsh_params.storage_hash_table = 'bit_packed_flat_hash_table'

    # Hash width is log2 of the number of points, rounded to the nearest int.
    n_bits = int(np.round(np.log2(dataset.shape[0])))
    falconn.compute_number_of_hash_functions(n_bits, lsh_params)

    report('Starting building table...', end='')
    built = falconn.LSHIndex(lsh_params)
    built.setup(dataset)
    report('done')

    return built, normed_mean
def _init_falconn(self, dimension, number_bits, nb_tables):
    """Create an empty FALCONN index and remember its table count.

    The index is stored in ``self._falconn_table`` without any data (no
    ``setup`` call happens here), ``self._falconn_query_object`` is reset to
    ``None`` and ``self._FALCONN_NB_TABLES`` records ``nb_tables``.

    Args:
        dimension: Dimension of the vectors that will be indexed.
        number_bits: Number of hash bits; each table gets 2^number_bits
            bins. A rule of thumb is to have the number of bins be the same
            order of magnitude as the number of data points.
        nb_tables: Number of hash tables; asserted to be at least
            ``self._NEIGHBORS``.
    """
    import falconn

    assert nb_tables >= self._NEIGHBORS

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = dimension
    lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
    lsh_params.distance_function = falconn.DistanceFunction.EuclideanSquared
    lsh_params.l = nb_tables
    # for dense data set it to 1; for sparse data set it to 2
    lsh_params.num_rotations = 2
    lsh_params.seed = 5721840
    # 0 = use all the available threads to set up
    lsh_params.num_setup_threads = 0
    lsh_params.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(number_bits, lsh_params)

    self._falconn_table = falconn.LSHIndex(lsh_params)
    self._falconn_query_object = None
    self._FALCONN_NB_TABLES = nb_tables
def __set_hierarchical_LSH_Index(self, cluster, number_of_tables, hash_bit):
    """Build the LSH query object used for hierarchical clustering.

    The resulting query object is stored in ``self.hierarchical_LSH_Index``;
    nothing is returned.

    Args:
        cluster: The set of vectors to be clustered (indexable; the
            dimension is taken from ``len(cluster[0])``).
        number_of_tables: Number of hash tables used in each nearest
            neighbour search.
        hash_bit: Ignored. The value passed in is always replaced by
            floor(log2(len(cluster))) so that the number of bins per table
            tracks the number of points. The parameter is kept so existing
            callers keep working.
    """
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(cluster[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1  # parameter associated with cross-polytope
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)

    # floor(log2(n)) computed on the integer itself, so the result does not
    # depend on floating-point rounding of log(n) / log(2).
    hash_bit = len(cluster).bit_length() - 1
    # Each table gets 2^hash_bit bins, i.e. the same order of magnitude as
    # the number of data points.
    falconn.compute_number_of_hash_functions(hash_bit, params_cp)

    print('Constructing the LSH Index For Cluster Combine Method.')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(cluster)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    self.hierarchical_LSH_Index = table.construct_query_object()
def falconn_table(sig_mat):
    '''
    Construct a falconn table from a signature matrix and return the table.

    The matrix is coerced to float32, divided by its largest row norm
    (floored at 1e-6) and mean-centred before indexing. When ``sig_mat`` is
    already float32 these steps happen in place on the caller's array;
    otherwise they happen on the float32 copy. The centring vector is not
    returned.

    Keyword Argument:
    sig_mat -- A numpy ndarray, where each row is signature at a time
               window center
    '''
    # FALCONN is fed 32-bit floats.
    if sig_mat.dtype != np.float32:
        sig_mat = sig_mat.astype(np.float32)

    # Scale by the largest row norm, then centre on the column mean.
    row_norms = np.linalg.norm(sig_mat, axis=1)
    sig_mat /= max(1e-6, max(row_norms))
    center = np.mean(sig_mat, axis=0)
    sig_mat -= center

    config = falconn.LSHConstructionParameters()
    config.dimension = len(sig_mat[0])
    # Cross-polytope family with squared Euclidean distance.
    config.lsh_family = falconn.LSHFamily.CrossPolytope
    config.distance_function = falconn.DistanceFunction.EuclideanSquared
    # Two rotations, since the signature is very likely a large sparse
    # matrix.
    config.num_rotations = 2
    config.l = 50
    config.seed = 5721840
    # 0 setup threads = use all; bit-packed flat storage.
    config.num_setup_threads = 0
    config.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)

    # Hash width follows the number of observations (truncated log2).
    n_obs = sig_mat.shape[0]
    n_bits = int(np.log2(n_obs))
    falconn.compute_number_of_hash_functions(n_bits, config)

    index = falconn.LSHIndex(config)
    index.setup(sig_mat)
    return index
def add_to_data(self, point):
    """Add a new point to the dataset and rebuild the LSH index.

    The point is stacked under ``self.data`` and a fresh index is built
    over the combined data. ``self.query_object`` is rebuilt as well, so
    queries issued afterwards see the new point; the probe count of the
    previous query object, if there was one, is carried over.

    Args:
        point: Row (or rows) accepted by ``np.vstack`` together with
            ``self.data``.

    Returns:
        None
    """
    falconn.compute_number_of_hash_functions(18, self.params_cp)
    print('Constructing the LSH table')

    # Build the new index completely before replacing any attribute, so a
    # failure leaves the instance on its previous, consistent state.
    data = np.vstack([self.data, point])
    table = falconn.LSHIndex(self.params_cp)
    table.setup(data)
    query_object = table.construct_query_object()

    # A query object is bound to the index it was constructed from; keeping
    # the old one would silently keep searching the old data.
    previous = getattr(self, 'query_object', None)
    if previous is not None:
        query_object.set_num_probes(previous.get_num_probes())

    self.data = data
    self.table = table
    self.query_object = query_object
def add(self, vecs):
    """Index ``vecs`` (mean-centred) and keep a query object for them.

    Sets ``self.center`` (column mean of ``vecs``, which callers need to
    subtract from queries), ``self.params_cp``, ``self.table`` and
    ``self.query_object``. ``vecs`` itself is not modified: the index is
    built on ``vecs - self.center``.

    Args:
        vecs: 2-D array, one vector per row.
    """
    n_points, dim = vecs.shape[0], vecs.shape[1]

    self.center = np.mean(vecs, axis=0)  # Subtract mean vector later
    self.params_cp = falconn.get_default_parameters(
        num_points=n_points,
        dimension=dim,
        distance=falconn.DistanceFunction.EuclideanSquared,
        is_sufficiently_dense=True)

    # Hash width is log2 of the number of points, rounded to the nearest int.
    num_bits = int(np.round(np.log2(n_points)))
    falconn.compute_number_of_hash_functions(num_bits, self.params_cp)

    self.table = falconn.LSHIndex(self.params_cp)
    self.table.setup(vecs - self.center)
    self.query_object = self.table.construct_query_object()
def build_LSH_index(self):
    """Index ``self.vectorized_articles`` by negative inner product.

    Builds a 200-table cross-polytope index with 21 hash bits and stores
    the index in ``self.table`` and its query object in ``self.query``. The
    query object uses one probe per table. No seed is set explicitly.
    """
    config = falconn.LSHConstructionParameters()
    config.dimension = self.vectorized_articles.shape[1]
    config.lsh_family = falconn.LSHFamily.CrossPolytope
    config.distance_function = falconn.DistanceFunction.NegativeInnerProduct
    config.l = 200
    config.num_rotations = 1
    config.num_setup_threads = 0
    config.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(21, config)

    self.table = falconn.LSHIndex(config)
    self.table.setup(self.vectorized_articles)

    self.query = self.table.construct_query_object()
    self.query.set_num_probes(config.l)
def init_lsh(self):
    """
    Initializes locality-sensitive hashing with FALCONN to find nearest
    neighbors in training data.

    For every layer in ``self.layers`` the stored training activations are
    normalised to unit length and mean-centred IN PLACE, an index is built
    over them, and the results are stored in:

    * ``self.query_objects[layer]`` -- object that can be queried to find
      nearest neighbors at that layer;
    * ``self.centers[layer]`` -- mean of the normalised training
      representation, which needs to be subtracted from queries before LSH.
    """
    self.query_objects = {}
    self.centers = {}

    for layer in self.layers:
        assert self.nb_tables >= self.neighbors

        # Normalize all the lengths, since we care about the cosine
        # similarity.
        self.train_activations_lsh[layer] /= np.linalg.norm(
            self.train_activations_lsh[layer], axis=1).reshape(-1, 1)

        # Center the dataset and the queries: this improves the performance
        # of LSH quite a bit.
        center = np.mean(self.train_activations_lsh[layer], axis=0)
        self.train_activations_lsh[layer] -= center
        self.centers[layer] = center

        activations = self.train_activations_lsh[layer]

        # LSH parameters
        params_cp = falconn.LSHConstructionParameters()
        # Take the dimension from the array shape; indexing row 1 would
        # fail when a layer holds a single training point.
        params_cp.dimension = activations.shape[1]
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = (
            falconn.DistanceFunction.EuclideanSquared)
        params_cp.l = self.nb_tables
        # for dense data set it to 1; for sparse data set it to 2
        params_cp.num_rotations = 2
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = (
            falconn.StorageHashTable.BitPackedFlatHashTable)

        # Each table gets 2^number_bits bins; a good choice is a bin count
        # of the same order of magnitude as the number of data points.
        falconn.compute_number_of_hash_functions(self.number_bits, params_cp)

        print('Constructing the LSH table')
        table = falconn.LSHIndex(params_cp)
        table.setup(activations)

        # One probe per table.
        query_object = table.construct_query_object()
        query_object.set_num_probes(self.nb_tables)
        self.query_objects[layer] = query_object
def setup_lsh(X, num_probes=100):
    """Build a 100-table cross-polytope index over ``X``.

    Args:
        X: 2-D array, one point per row (asserted).
        num_probes: Number of probes the returned query object uses.

    Returns:
        A FALCONN query object over ``X`` (squared Euclidean distance,
        16 hash bits, seed 1234).
    """
    assert X.ndim == 2

    config = falconn.LSHConstructionParameters()
    config.dimension = X.shape[1]
    config.lsh_family = falconn.LSHFamily.CrossPolytope
    config.distance_function = falconn.DistanceFunction.EuclideanSquared
    config.l = 100
    config.num_rotations = 1
    config.seed = 1234
    config.num_setup_threads = 0
    config.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(16, config)

    index = falconn.LSHIndex(config)
    index.setup(X)

    searcher = index.construct_query_object()
    searcher.set_num_probes(num_probes)
    return searcher
def _create_bucket(segments):
    """
    Creates a bucket of segments to use for LSH similarity lookup.

    Returns a ``(segments, table)`` tuple: the input unchanged, and a
    25-table cross-polytope ``falconn.LSHIndex`` (squared Euclidean
    distance, two rotations, 18 hash bits) set up over it.
    """
    config = falconn.LSHConstructionParameters()
    config.dimension = len(segments[0])
    config.lsh_family = falconn.LSHFamily.CrossPolytope
    config.distance_function = falconn.DistanceFunction.EuclideanSquared
    config.l = 25
    config.num_rotations = 2
    config.seed = 5721840
    config.num_setup_threads = 0
    config.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(18, config)

    index = falconn.LSHIndex(config)
    index.setup(segments)
    return (segments, index)
def fit(self, X):
    """Index ``X`` with a FALCONN cross-polytope table.

    ``X`` is converted to float32 if necessary, scaled to unit row norm for
    the ``'angular'`` metric, and mean-centred. When ``X`` already is
    float32 these steps modify the caller's array in place. The mean is
    kept in ``self._center``.

    Also sets ``self._params``, ``self._index`` (with ``self._num_probes``
    probes) and ``self._buf``, a zeroed float32 vector of the data
    dimension.

    Args:
        X: 2-D array, one point per row.
    """
    if X.dtype != numpy.float32:
        X = X.astype(numpy.float32)
    if self._metric == 'angular':
        row_norms = numpy.linalg.norm(X, axis=1)
        X /= row_norms.reshape(-1, 1)
    self._center = numpy.mean(X, axis=0)
    X -= self._center

    import falconn

    dim = X.shape[1]
    params = falconn.LSHConstructionParameters()
    self._params = params
    params.dimension = dim
    params.distance_function = 'euclidean_squared'
    params.lsh_family = 'cross_polytope'
    falconn.compute_number_of_hash_functions(self._num_bits, params)
    params.l = self._num_tables
    params.num_rotations = 1
    params.num_setup_threads = 0
    params.storage_hash_table = 'flat_hash_table'
    params.seed = 95225714

    self._index = falconn.LSHIndex(params)
    self._index.setup(X)
    self._index.set_num_probes(self._num_probes)
    self._buf = numpy.zeros((dim,), dtype=numpy.float32)
def setup_second_layer(self, number_of_tables=50):
    """Build the second-layer LSH index over ``self.X_``.

    The index dimension is ``self.X.shape[1] + 1``. ``self.X_`` is replaced
    by its ``astype('float')`` conversion before indexing, and the
    resulting query object is stored in ``self.query_object``. Construction
    time is printed.

    Args:
        number_of_tables: Number of hash tables (``l``).
    """
    config = falconn.LSHConstructionParameters()
    config.dimension = self.X.shape[1] + 1
    config.lsh_family = falconn.LSHFamily.CrossPolytope
    config.distance_function = falconn.DistanceFunction.EuclideanSquared
    config.l = number_of_tables
    config.num_rotations = 1
    config.seed = 5721840
    config.num_setup_threads = 0
    config.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(15, config)

    print('Constructing the LSH table')
    started = timeit.default_timer()
    index = falconn.LSHIndex(config)
    self.X_ = self.X_.astype('float')
    index.setup(self.X_)
    finished = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(finished - started))

    self.query_object = index.construct_query_object()
def __falconn_fit(self):
    """
    Initializes locality-sensitive hashing with FALCONN to find nearest
    neighbors in training data.

    Reads ``nb_tables`` and ``number_bits`` from ``self.kwargs``, builds the
    index in ``self._falconn_table``, resets ``self._falconn_query_object``
    to ``None`` and records the table count in ``self._FALCONN_NB_TABLES``.
    ``self.features`` is mean-centred in place; the mean is kept in
    ``self.center``.
    """
    import falconn

    n_dims = self.features.shape[1]
    n_tables = self.kwargs['nb_tables']
    n_bits = self.kwargs['number_bits']

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = n_dims
    lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
    lsh_params.distance_function = falconn.DistanceFunction.EuclideanSquared
    lsh_params.l = n_tables
    # for dense data set it to 1; for sparse data set it to 2
    lsh_params.num_rotations = 2
    lsh_params.seed = 5721840
    # 0 = use all the available threads to set up
    lsh_params.num_setup_threads = 0
    lsh_params.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    # n_bits-bit hashes give each table 2^n_bits bins; a rule of thumb is
    # to have the number of bins be the same order of magnitude as the
    # number of data points.
    falconn.compute_number_of_hash_functions(n_bits, lsh_params)

    self._falconn_table = falconn.LSHIndex(lsh_params)
    self._falconn_query_object = None
    self._FALCONN_NB_TABLES = n_tables

    # Center the dataset and the queries: this improves the performance of
    # LSH quite a bit.
    self.center = np.mean(self.features, axis=0)
    self.features -= self.center

    # Add the centred features to the falconn table.
    self._falconn_table.setup(self.features)
def fit(self, X):
    """Index ``X`` with a FALCONN cross-polytope table.

    ``X`` is converted to float32 if necessary. For the ``'hamming'`` metric
    every entry below 0.5 is replaced by -1; for ``'angular'`` and
    ``'hamming'`` the rows are then scaled to unit norm. Finally the data is
    mean-centred. When ``X`` already is float32 all of this modifies the
    caller's array in place. The mean is kept in ``self._center``.

    Also sets ``self._params``, ``self._index`` and ``self._query_object``
    (configured with ``self._num_probes`` probes).

    Args:
        X: 2-D array, one point per row.
    """
    if X.dtype != numpy.float32:
        X = X.astype(numpy.float32)
    if self._metric == 'hamming':
        # replace all zeroes by -1
        X[X < 0.5] = -1
    if self._metric in ('angular', 'hamming'):
        row_norms = numpy.linalg.norm(X, axis=1)
        X /= row_norms.reshape(-1, 1)
    self._center = numpy.mean(X, axis=0)
    X -= self._center

    params = falconn.LSHConstructionParameters()
    self._params = params
    params.dimension = X.shape[1]
    params.distance_function = falconn.DistanceFunction.EuclideanSquared
    params.lsh_family = falconn.LSHFamily.CrossPolytope
    falconn.compute_number_of_hash_functions(self._num_bits, params)
    params.l = self._num_tables
    params.num_rotations = 1
    params.num_setup_threads = 0
    params.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    params.seed = 95225714

    self._index = falconn.LSHIndex(params)
    self._index.setup(X)
    self._query_object = self._index.construct_query_object()
    self._query_object.set_num_probes(self._num_probes)
params_cp = falconn.LSHConstructionParameters() params_cp.dimension = len(dataset[0]) params_cp.lsh_family = 'cross_polytope' params_cp.distance_function = 'euclidean_squared' params_cp.l = number_of_tables # we set one rotation, since the data is dense enough, # for sparse data set it to 2 params_cp.num_rotations = 1 params_cp.seed = 5721840 # we want to use all the available threads to set up params_cp.num_setup_threads = 0 params_cp.storage_hash_table = 'bit_packed_flat_hash_table' # we build 18-bit hashes so that each table has # 2^18 bins; this is a good choise since 2^18 is of the same # order of magnitude as the number of data points falconn.compute_number_of_hash_functions(18, params_cp) print('Constructing the LSH table') t1 = timeit.default_timer() table = falconn.LSHIndex(params_cp) table.setup(dataset) t2 = timeit.default_timer() print('Done') print('Construction time: {}'.format(t2 - t1)) # find the smallest number of probes to achieve accuracy 0.9 # using the binary search print('Choosing number of probes') number_of_probes = number_of_tables def evaluate_number_of_probes(number_of_probes): table.set_num_probes(number_of_probes)
def _split_method_record(record):
    """Split a method record into ``(line_count, tokens, locations)``.

    Record layout, as read by the caller:
    ``<lines>ccdCodeLineSeparate<tokens>ccdTokenSeparate<locations>``.
    """
    by_line = record.split("ccdCodeLineSeparate")
    by_token = by_line[1].split('ccdTokenSeparate')
    return by_line[0], by_token[0], by_token[1]


def lsh_for_ccd(dataset: np.array, queries: list, methoddict: dict, lastIndexBefore: int):
    """Report code-clone pairs found through an LSH near-neighbour search.

    A 10-table cross-polytope index is built over ``dataset``; every query
    is matched against neighbours within ``endTheta``. Records are looked up
    in ``methoddict`` by the md5 of the vector's ``tolist()`` string.

    * A neighbour with the same key as the query is the query's own record:
      if that record lists several locations (joined by
      ``ccdFileKeySeparate``), every pair of locations is written through
      ``writer``.
    * Otherwise only neighbours whose index is greater than the running
      counter are considered; pairs rejected by ``lineFilter`` are dropped,
      and the rest are reported through ``getCloneResult`` when the squared
      distance (or the ``getOptDist``-adjusted distance) is small enough.

    The running counter starts at ``lastIndexBefore`` and advances by one
    per query. Relies on names defined elsewhere in the module:
    ``endTheta``, ``optTheta``, ``cloneTheta``, ``minbeta``, ``writer``,
    ``getCloneTuple``, ``getCloneResult``, ``lineFilter``, ``betaMain`` and
    ``getOptDist``.

    Args:
        dataset: 2-D array of method vectors, one per row.
        queries: Iterable of query vectors (numpy arrays).
        methoddict: Maps md5 keys to method records.
        lastIndexBefore: Initial value of the running index counter.
    """
    number_of_tables = 10
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(18, params_cp)

    print('Constructing the LSH table')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    query_object = table.construct_query_object()
    currentIter = lastIndexBefore

    for query in queries:
        neighbors = query_object.find_near_neighbors(query, threshold=endTheta)
        if len(neighbors) == 0:
            currentIter = currentIter + 1
            continue

        # The query's key and record do not depend on the neighbour, so
        # they are computed once per query instead of once per neighbour.
        queryMdKey = hashlib.md5(str(query.tolist()).encode()).hexdigest()
        methodsLeftLine, methodsLeftToken, methodsLeft = _split_method_record(
            str(methoddict[queryMdKey]))

        for neighbor in neighbors:
            neighborMdKey = hashlib.md5(
                str(dataset[neighbor].tolist()).encode()).hexdigest()
            # Example record:
            # 13ccdCodeLineSeparate112443321234ccdTokenSeparate/home/xxx/xx.java,3,15ccdFileKeySeparate/home/xxx/xx.java,31,45
            methodsRightLine, methodsRightToken, methodsRight = (
                _split_method_record(str(methoddict[neighborMdKey])))

            if queryMdKey == neighborMdKey:
                # Same record: emit every pair of locations it lists.
                if "ccdFileKeySeparate" in methodsLeft:
                    locations = methodsLeft.split("ccdFileKeySeparate")
                    for i, first in enumerate(locations):
                        for second in locations[i + 1:]:
                            writer.writerow(
                                getCloneTuple(first + "," + second))
                continue

            if neighbor > currentIter:
                if not lineFilter(int(methodsLeftLine), int(methodsRightLine)):
                    dist = np.linalg.norm(query - dataset[neighbor])
                    dist *= dist
                    if dist <= optTheta:
                        getCloneResult(methodsLeft, methodsRight)
                    else:
                        beta = betaMain(methodsLeftToken, methodsRightToken)
                        if beta <= minbeta:
                            continue
                        dist = getOptDist(beta, dist)
                        if dist < cloneTheta:
                            getCloneResult(methodsLeft, methodsRight)

        currentIter = currentIter + 1

    print(time.time())
def __init__(self, dataset, params, num_bits=16):
    """Build an LSH index over ``dataset`` and keep it in ``self._table``.

    Args:
        dataset: Points to index, passed straight to ``LSHIndex.setup``.
        params: Construction parameters. They are updated in place by
            ``compute_number_of_hash_functions`` before the index is built.
        num_bits: Number of hash bits used to derive the number of hash
            functions.
    """
    fa.compute_number_of_hash_functions(num_bits, params)
    self._table = index = fa.LSHIndex(params)
    index.setup(dataset)
def generate_candidate_threshold(entity_embedding=None, data_ids="OpenEA", path="", threshold=0.2,
                                 output_path=False, entity_file="ent_embeds,npy", normalize=True,
                                 metric="euclidean", lsh_family="hyperplane", number_of_tables=500):
    """Generate candidate entity pairs with an LSH near-neighbour search.

    The left-hand entities of ``data_ids`` are indexed and every right-hand
    entity is used as a query; each hit within ``threshold`` yields a
    candidate ``(left_id, right_id, label)`` with label 1 when the hit is
    the true counterpart and 0 otherwise. Recall and candidate statistics
    are printed. Nothing is returned.

    :param entity_embedding: Embedding matrix indexed by entity id. When
        ``None`` it is loaded from ``path + entity_file``.
    :param data_ids: Either an array-like of ``[left_id, right_id]`` pairs,
        or one of the dataset names ``"OpenEA"`` / ``"dbp15k"``, in which
        case the test pairs are read from files under ``path``.
    :param path: Directory prefix for all files read here. For ``"OpenEA"``
        the test-link location is derived from fixed components of this
        path, so it has to follow that directory layout.
    :param threshold: Distance threshold for ``find_near_neighbors``.
    :param output_path: If ``True``, the candidates are written to a file
        next to ``path`` named ``topk_<threshold>_name_ngram``.
    :param entity_file: File name of the embedding matrix.
        NOTE(review): the default contains a comma (``ent_embeds,npy``),
        which looks like a typo for ``ent_embeds.npy`` -- confirm before
        relying on it.
    :param normalize: If true, L2-normalise both sides before indexing.
    :param metric: 1. ``"inner"``: inner product of the vectors;
        2. ``"euclidean"``: Euclidean distance (after L2 normalisation it
        is proportional to the cosine distance). With ``"euclidean"`` the
        embeddings are also mean-centred.
    :param lsh_family: ``"crosspolytope"`` or ``"hyperplane"``.
    :param number_of_tables: Number of hash tables; also used as the number
        of probes.
    :return: None
    """
    if entity_embedding is None:
        entity_file_path = path + entity_file
        entity_embedding = np.load(entity_file_path)
        print("Load [%s] successfully!" % (entity_file_path))

    # Compare dataset names by value: identity comparison against a string
    # literal only works by accident of interning.
    dataset_name = data_ids if isinstance(data_ids, str) else None
    if dataset_name == "OpenEA":
        ent2id1, id2ent1, max_id = read_ent_id(path + "kg1_ent_ids")
        ent2id2, id2ent2, max_id = read_ent_id(path + "kg2_ent_ids")
        paths = path.split('/')
        test_path = "/".join([paths[1], paths[2], paths[3], "datasets",
                              paths[7], paths[8], paths[9]])
        test_ids = []
        with open('/' + test_path + r"/test_links", 'r', encoding='utf-8') as f:
            for line in f:
                items = line.strip().split("\t")
                test_ids.append(
                    [int(ent2id1[items[0]]), int(ent2id2[items[1]])])
        data_ids = test_ids
    elif dataset_name == "dbp15k":
        # Only matches on the test set are considered.
        data_ids = read_ids(path + "ref_ent_ids")

    data_ids = np.array(data_ids).astype(int)
    # astype() copies, so the centring below never touches the caller's
    # array.
    entity_embedding = entity_embedding.astype(np.float32)
    if metric == "euclidean":
        entity_embedding -= np.mean(entity_embedding, axis=0)
    Lvec = entity_embedding[data_ids[:, 0]]
    Rvec = entity_embedding[data_ids[:, 1]]

    if os.path.exists(path + "mapping_mat.npy"):
        # Final vectors after the OpenEA model's mapping transformation.
        mapping = np.load(path + "mapping_mat.npy")
        Lvec = np.matmul(Lvec, mapping)
    if normalize:
        Lvec = preprocessing.normalize(Lvec, norm="l2", axis=1)
        Rvec = preprocessing.normalize(Rvec, norm="l2", axis=1)

    seed = 119417657
    L_True = data_ids[:, 0].tolist()
    print("shape:", entity_embedding.shape)

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = entity_embedding.shape[1]
    if lsh_family == "crosspolytope":
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    elif lsh_family == "hyperplane":
        params_cp.lsh_family = falconn.LSHFamily.Hyperplane
    if metric == "euclidean":
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    elif metric == "inner":
        params_cp.distance_function = (
            falconn.DistanceFunction.NegativeInnerProduct)
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = seed
    # Index construction is limited to two threads.
    params_cp.num_setup_threads = 2
    params_cp.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    # 20-bit hashes: each table has 2^20 bins.
    falconn.compute_number_of_hash_functions(20, params_cp)

    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(Lvec)
    t2 = timeit.default_timer()
    print('Construction time: {}'.format(t2 - t1))

    query_object = table.construct_query_object()
    number_of_probes = number_of_tables
    print('Choosing number of probes: ', number_of_probes)
    query_object.set_num_probes(number_of_probes)

    t1 = timeit.default_timer()
    true_cnt = 0
    total = 0
    true_all = data_ids.shape[0]
    node_pairs = []
    print("Metric:", metric, "Threshold:", threshold)
    for ids_index, pair in enumerate(data_ids):
        ans = query_object.find_near_neighbors(Rvec[ids_index],
                                               threshold=threshold)
        for hit in ans:
            if pair[0] == L_True[hit]:
                true_cnt += 1
                node_pairs.append((pair[0], pair[1], 1))
            else:
                node_pairs.append((L_True[hit], pair[1], 0))
        total += len(ans)
    print('Threshold:[%f] True cnt:[%d] Generate All cnt:[%d] Total:[%d] Recall:[%f] P/E ratio:[%f] Metric:[%s]'
          % (threshold, true_cnt, total, true_all, true_cnt/true_all, total/true_all, metric))
    t2 = timeit.default_timer()
    print('Generate Candidate time: {}'.format(t2 - t1))

    if output_path == True:  # only the flag value requests output
        output_path = ("/".join(path.split('/')[:-1]) + '/topk_'
                       + str(threshold) + '_name_ngram')
        print('output path:', output_path)
        with open(output_path, 'w', encoding='utf8') as f:
            # The ids are integers, so format them instead of concatenating
            # them with str.
            f.writelines(
                '{}\t{}\t{}\n'.format(left_id, right_id, label)
                for left_id, right_id, label in node_pairs)
# queries -= center # print('Done') #assert dataset.dtype == np.float32 number_of_probes = [900] # params_cp_blue = falconn.LSHConstructionParameters() params_cp_blue.dimension = len(dataset_blue[0]) params_cp_blue.lsh_family = falconn.LSHFamily.CrossPolytope params_cp_blue.distance_function = falconn.DistanceFunction.EuclideanSquared params_cp_blue.l = number_of_tables params_cp_blue.num_rotations = 1 params_cp_blue.seed = 666666 params_cp_blue.num_setup_threads = 1 params_cp_blue.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable falconn.compute_number_of_hash_functions(20, params_cp_blue) print('Constructing the LSH table') t1 = timeit.default_timer() table_blue = falconn.LSHIndex(params_cp_blue) table_blue.setup(dataset_blue) t2 = timeit.default_timer() query_object_blue = table_blue.construct_query_object() print('Done') print('Construction time: {}'.format((t2 - t1))) params_cp_green = falconn.LSHConstructionParameters() params_cp_green.dimension = len(dataset_green[0]) params_cp_green.lsh_family = falconn.LSHFamily.CrossPolytope params_cp_green.distance_function = falconn.DistanceFunction.EuclideanSquared params_cp_green.l = number_of_tables
# In[ ]:

import falconn

# In[ ]:

# Minimal FALCONN walkthrough: one hash table, one probe.
# `num_dimensions`, `dataset` and `query` are expected to exist already in
# the notebook session.
num_tables = 1

parameters = falconn.LSHConstructionParameters()
parameters.dimension = num_dimensions
parameters.lsh_family = falconn.LSHFamily.CrossPolytope
parameters.distance_function = falconn.DistanceFunction.EuclideanSquared
parameters.l = num_tables
parameters.num_rotations = 1
parameters.num_setup_threads = 1
parameters.storage_hash_table = (
    falconn.StorageHashTable.BitPackedFlatHashTable)
# 16 hash bits.
falconn.compute_number_of_hash_functions(16, parameters)

# Time the index construction.
index = falconn.LSHIndex(parameters)
get_ipython().run_line_magic('time', 'index.setup(dataset)')

# Time a 5-nearest-neighbour query.
query_object = index.construct_query_object()
num_probes = 1
query_object.set_num_probes(num_probes)
get_ipython().run_line_magic(
    'timeit', 'query_object.find_k_nearest_neighbors(query, 5)')

# In[ ]:

# Query with a point taken from the dataset itself.
query = dataset[5000]
print(query_object.find_k_nearest_neighbors(query, 5))
def set_clustering_LSH_Index(self, number_of_queries, query_accuracy, number_of_tables, hash_bit):
    """Build the clustering LSH index and store its query object.

    The last ``number_of_queries`` rows of ``self.w2v_vectors`` are held
    out as tuning queries and the rest are indexed. Both parts are centred
    on the mean of the indexed part; the subtraction is done in place on
    the slices taken from ``self.w2v_vectors``. The number of probes is
    chosen by ``self.probeGenerator`` against the linear-scan answers, then
    per-query time and precision on the held-out queries are printed. The
    tuned query object ends up in ``self.query_object``.

    Args:
        number_of_queries: The number of queries used to determine the
            number of probes.
        query_accuracy: Target accuracy of the index, handed to
            ``self.probeGenerator``.
        number_of_tables: The number of hash tables used for a given
            nearest neighbour search.
        hash_bit: Number of hash bits used to determine the number of hash
            functions.
    """
    print("Setting Clustering Index")
    split_at = len(self.w2v_vectors) - number_of_queries
    held_out = self.w2v_vectors[split_at:]
    indexed = self.w2v_vectors[:split_at]

    # Centre both parts on the mean of the indexed vectors.
    center = np.mean(indexed, axis=0)
    indexed -= center
    held_out -= center

    # Linear scan gives the reference answers for the held-out queries.
    answers = self.linearScan_answerGenerator(indexed, held_out)

    print('Choosing number of probes')
    init_number_of_probes = 600

    config = falconn.LSHConstructionParameters()
    config.dimension = len(indexed[0])  # = 50 for Glove6B.50d
    config.lsh_family = falconn.LSHFamily.CrossPolytope
    config.distance_function = falconn.DistanceFunction.EuclideanSquared
    config.l = number_of_tables
    config.num_rotations = 1
    config.seed = 5721840
    config.num_setup_threads = 0
    config.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(hash_bit, config)

    print('Constructing the LSH Index')
    build_start = timeit.default_timer()
    index = falconn.LSHIndex(config)
    index.setup(indexed)
    build_end = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(build_end - build_start))

    searcher = index.construct_query_object()
    number_of_probes = self.probeGenerator(
        query_accuracy, init_number_of_probes, searcher, answers, held_out,
        number_of_tables)
    searcher.set_num_probes(number_of_probes)

    # Performance statistics on the held-out queries.
    query_start = timeit.default_timer()
    hits = sum(
        1 for position, vector in enumerate(held_out)
        if searcher.find_nearest_neighbor(vector) == answers[position])
    query_end = timeit.default_timer()
    print('Query time: {}'.format((query_end - query_start) / len(held_out)))
    print('Precision: {}'.format(float(hits) / len(held_out)))

    self.query_object = searcher
    print("Vectors Successfully Hashed. Clustering LSH Index Created")