def build(self, data, k, cp):
    """Build a NearPy LSH index over the rows of ``data``.

    Parameters
    ----------
    data : ndarray of shape (n_items, vector_length)
        Item vectors to index, one per row.
    k : int
        Number of neighbours requested at build time.
        NOTE(review): the original code immediately shadowed this
        parameter with a loop variable, so it was effectively unused;
        the shadowing is fixed below but `k` is still not consumed
        here — the query-time filter size is set in `query_train`.
    cp : object
        Configuration passed through to ``init_method_param``.
    """
    n_items, vector_length = data.shape
    # Fetch method hyper-parameters (hash table count, bits per hash).
    method_param = init_method_param("nearpy", data=data, cp=cp)
    hash_counts = method_param["hash_counts"]
    n_bits = method_param["n_bits"]
    # Filter size is re-set per query in query_train; 10 is a placeholder.
    self.filter = NearestFilter(10)
    hashes = []
    # BUGFIX: loop variable renamed from `k`, which shadowed the `k`
    # parameter above. Generated hash names ('rbp_0', 'rbp_1', ...) are
    # unchanged.
    for h in range(hash_counts):
        nearpy_rbp = nearpy.hashes.RandomBinaryProjections('rbp_%d' % h, n_bits)
        hashes.append(nearpy_rbp)
    if self.metric == 'euclidean':
        dist = nearpy.distances.EuclideanDistance()
        self.index = nearpy.Engine(
            vector_length, lshashes=hashes, distance=dist,
            vector_filters=[self.filter])
    else:
        # Default (angular) = Cosine distance (NearPy's engine default).
        self.index = nearpy.Engine(
            vector_length, lshashes=hashes, vector_filters=[self.filter])
    # Store every row under its integer index as the vector id.
    for i, x in enumerate(data):
        self.index.store_vector(x, i)


def query_train(self, data, k):
    """Query the index with each row of ``data``, returning k neighbours.

    Parameters
    ----------
    data : ndarray of shape (n_queries, vector_length)
        Query vectors, one per row.
    k : int
        Number of neighbours to return per query.

    Returns
    -------
    (neighbors, distances) : tuple of ndarrays, each (n_queries, k)
        Neighbour ids (int) and their distances.

    NOTE(review): assumes the engine always yields exactly k hits per
    query (the NearestFilter caps at k); if LSH returns fewer, the
    row assignment below raises — TODO confirm upstream guarantees.
    """
    # Resize the shared NearestFilter so the engine returns k hits.
    self.filter.N = k
    neighbors = np.empty((data.shape[0], k), dtype=int)
    distances = np.empty((data.shape[0], k))
    for i in range(len(data)):
        # Each hit is a (vector, id, distance) tuple from NearPy.
        hits = self.index.neighbours(data[i])
        neighbors[i] = np.asarray([hit[1] for hit in hits])
        distances[i] = np.asarray([hit[2] for hit in hits])
    return neighbors, distances
def fit(self, X):
    """Fit a NearPy LSH engine on the rows of ``X``.

    Builds ``self._hash_counts`` random-binary-projection hash tables of
    ``self._n_bits`` bits each, chooses the distance from
    ``self._metric``, and stores every row of ``X`` under its row index.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training vectors, one per row.
    """
    import nearpy
    hashes = []
    # BUGFIX: `xrange` is Python-2-only; `range` matches the rest of
    # the file and is Python-3 compatible (identical iteration here).
    for k in range(self._hash_counts):
        nearpy_rbp = nearpy.hashes.RandomBinaryProjections('rbp_%d' % k,
                                                           self._n_bits)
        hashes.append(nearpy_rbp)
    if self._metric == 'euclidean':
        dist = nearpy.distances.EuclideanDistance()
        self._nearpy_engine = nearpy.Engine(X.shape[1],
                                            lshashes=hashes,
                                            distance=dist)
    else:
        # Default (angular) = Cosine distance
        self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes)
    if self._metric == 'angular':
        # L2-normalize rows so cosine similarity reduces to dot product.
        # NOTE(review): relies on a module-level `sklearn` import not
        # visible in this chunk — confirm it exists at file top.
        X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')
    for i, x in enumerate(X):
        self._nearpy_engine.store_vector(x.tolist(), i)
def fit(self, X):
    """Fit a NearPy LSH engine on the rows of ``X``.

    Builds ``self._hash_counts`` random-binary-projection hash tables of
    ``self._n_bits`` bits each and stores every row of ``X`` under its
    row index. The engine uses NearPy's default (cosine) distance.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training vectors, one per row.
    """
    hashes = []
    # BUGFIX: `xrange` is Python-2-only; `range` is Python-3 compatible
    # and behaves identically for this iteration.
    for k in range(self._hash_counts):
        nearpy_rbp = nearpy.hashes.RandomBinaryProjections(
            'rbp_%d' % k, self._n_bits)
        hashes.append(nearpy_rbp)
    self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes)
    for i, x in enumerate(X):
        self._nearpy_engine.store_vector(x.tolist(), i)
def __LSH__(self, global_features, query_global_desc, buckets=5, n_neighbors=3):
    """Match query descriptors to gallery features via LSH.

    Starts with an engine of ``buckets`` random binary projections; when
    a query yields too few candidates, progressively halves the bucket
    count (building and caching coarser engines on demand) until more
    than ``n_neighbors`` hits are found or only one bucket remains.

    Returns an array of per-query candidate index arrays (lengths may
    differ between queries).
    """
    def _new_engine(n_projections):
        # Build an engine over all gallery vectors, keyed by row index.
        eng = nearpy.Engine(
            global_features.shape[1],
            lshashes=[nearpy.hashes.RandomBinaryProjections('rbp', n_projections)])
        for row_idx, vec in enumerate(global_features):
            eng.store_vector(vec, '%d' % row_idx)
        return eng

    engines = {buckets: _new_engine(buckets)}
    indices = []
    for query in tqdm.tqdm(query_global_desc, total=query_global_desc.shape[0]):
        nbr = engines[buckets].neighbours(query)
        if len(nbr) > n_neighbors:
            # Enough candidates at the finest granularity — done.
            indices.append(np.array([int(hit[1]) for hit in nbr]))
            continue
        # Too few hits: retry with coarser (fewer-projection) engines.
        b = buckets
        while len(nbr) <= n_neighbors and b > 1:
            b = b // 2
            if b not in engines:
                print('Create new engine with {:d} buckets'.format(b))
                engines[b] = _new_engine(b)
            nbr = engines[b].neighbours(query)
        indices.append(np.array([int(hit[1]) for hit in nbr]))
    return np.array(indices)
def fit(self, X):
    """Fit a NearPy LSH engine on the rows of ``X``.

    Builds ``self._hash_counts`` random-binary-projection hash tables of
    ``self._n_bits`` bits each and stores every row of ``X`` under its
    row index using NearPy's default distance.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training vectors, one per row.
    """
    import nearpy, nearpy.hashes, nearpy.distances
    hashes = []
    # TODO: doesn't seem like the NearPy code is using the metric??
    # BUGFIX: `xrange` is Python-2-only; `range` is Python-3 compatible
    # and behaves identically for this iteration.
    for k in range(self._hash_counts):
        nearpy_rbp = nearpy.hashes.RandomBinaryProjections(
            'rbp_%d' % k, self._n_bits)
        hashes.append(nearpy_rbp)
    self._nearpy_engine = nearpy.Engine(X.shape[1], lshashes=hashes)
    for i, x in enumerate(X):
        self._nearpy_engine.store_vector(x.tolist(), i)
def build_lsh_engine(orig, window_size, number_of_hashes, hash_dimensions):
    """Build a NearPy engine indexing rolling-window vectors of ``orig``.

    Each window of ``window_size`` consecutive rows of ``mk_vectors(orig)``
    is flattened into one vector; successive windows overlap at all but
    their first/last positions. Every window vector is stored under the
    key ``(start_index, str(orig[start:start + window_size]))``, so a
    later nearest-neighbour search over fan-work windows can recover
    both the position and the original text of each match.

    We index the original script (rather than the fan text) because
    match quality degrades as the engine's index grows.
    """
    base_vectors = mk_vectors(orig)
    n_windows = base_vectors.shape[0] - window_size + 1
    # One row per window: the window's rows flattened end-to-end.
    window_vectors = numpy.array([
        base_vectors[start:start + window_size, :].ravel()
        for start in range(n_windows)
    ])

    # Approximate nearest-neighbour engine: several independent random
    # binary projection hashes over cosine distance.
    projection_hashes = [
        nearpy.hashes.RandomBinaryProjections('rbp{}'.format(j), hash_dimensions)
        for j in range(number_of_hashes)
    ]
    engine = nearpy.Engine(window_vectors.shape[1],
                           lshashes=projection_hashes,
                           distance=nearpy.distances.CosineDistance())

    for ix, row in enumerate(window_vectors):
        engine.store_vector(row, (ix, str(orig[ix:ix + window_size])))
    return engine