def index_user_vectors():
    print('Performing indexing with HashPermutations...')
    global engine_perm

    t0 = time.time()
    print(k_dimen, d_dimen)

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)
    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())

    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))
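A follow-up sketch (not from the original source) showing how the permuted index built above could be queried; it assumes the module-level names the function references (engine_perm, user_vector) are populated as in the snippet:

index_user_vectors()
# Hypothetical query: pick any indexed user and fetch approximate neighbours
some_user = next(iter(user_vector))
for vec, user_id, dist in engine_perm.neighbours(user_vector[some_user]):
    print(user_id, dist)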
def LSH(Layers, K):
    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]

    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index the vectors (set their data to a unique metadata string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]
        meta_data = (str(index) + ',' + str(int(video_data[index, 0])) + ', '
                     + str(int(video_data[index, 1])) + ', ' + str(int(video_data[index, 2]))
                     + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4]))
        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)
    print('stop')
def knn(data, k):
    assert k <= len(data) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1  # account for the query point itself being returned
    n, dimension = data.shape
    ind = []
    dist = []

    # Use at most 10 random projections so buckets stay populated
    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension, lshashes=[rbp], vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):
        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])   # drop the query point itself
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
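A minimal usage sketch for knn() above; the imports and random data are illustrative assumptions, not part of the original snippet:

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.filters import NearestFilter

data = np.random.randn(100, 5)   # 100 points in 5 dimensions (hypothetical)
N, dist, ind = knn(data, 3)      # 3 approximate neighbours per point
print(ind[0], dist[0])           # neighbour indices and distances of point 0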
class StateDBEngine(object):
    def __init__(self):
        # initialize "nearby" library
        self.dim = 4
        self.rbp = RandomBinaryProjections('rbp', 100)
        self.engine = Engine(self.dim, lshashes=[self.rbp])
        # performance counter
        self.counter = 0

    def add(self, x, data):
        # print 'add data = ', data
        self.engine.store_vector(x, data)
        self.counter += 1

    def lookup(self, x, THRESHOLD=0.1):
        naver = self.engine.neighbours(x)
        if len(naver) == 0:
            return None

        pt, data, d = naver[0]
        # print 'lhs, rhs', x, pt
        # print 'd = ', d, (d < THRESHOLD), (data is None)
        if d < THRESHOLD:
            return data
        else:
            return None
def index_in_text_engine(nid_gen, tfidf, lsh_projections, tfidf_is_dense=False):
    num_features = tfidf.shape[1]
    print("TF-IDF shape: " + str(tfidf.shape))
    text_engine = Engine(num_features, lshashes=[lsh_projections], distance=CosineDistance())

    st = time.time()
    row_idx = 0
    for key in nid_gen:
        if tfidf_is_dense:
            dense_row = tfidf[row_idx]
            array = dense_row
        else:
            sparse_row = tfidf.getrow(row_idx)
            dense_row = sparse_row.todense()
            array = dense_row.A[0]
        row_idx += 1
        text_engine.store_vector(array, key)
    et = time.time()
    print("Total index text: " + str(et - st))
    return text_engine
def setUp(self):
    logging.basicConfig(level=logging.WARNING)
    numpy.random.seed(11)

    # Create permutations meta-hash
    self.permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    self.permutations.add_child_hash(rbp, rbp_conf)

    # Create engine with meta hash and cosine distance
    self.engine_perm = Engine(200, lshashes=[self.permutations],
                              distance=CosineDistance())

    # Create engine without permutation meta-hash
    self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
def test_random_discretized_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # First get recall and precision for a single-projection random hash
    rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall1, precision1, searchtime1))

    # Then get recall and precision for a two-projection random hash
    rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    result = exp.perform_experiment([engine])

    recall2 = result[0][0]
    precision2 = result[0][1]
    searchtime2 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall2, precision2, searchtime2))

    # Many things are random here, but the precision should increase
    # with the number of projections
    self.assertTrue(precision2 > precision1)
class LSH:
    def __init__(self, path, dataSize):
        self.path = path
        self.dataSize = dataSize

    def preprocess(self):
        ids = []
        meta = []
        data = []
        for i in range(self.dataSize):
            with open(self.path + str(i) + ".data", "rb") as file:
                f_song_id = pickle.load(file)
                f_songMeta = pickle.load(file)
                f_data = pickle.load(file)
                ids.append(f_song_id)
                meta.append(f_songMeta)
                data.append(f_data)
        self.id = np.array(ids)
        self.meta = np.array(meta)
        self.data = np.array(data)

    def generate_hashtable(self):
        self.engine = Engine(self.data.shape[1],
                             lshashes=[RandomBinaryProjections('rbp', 20)])
        for i in range(self.dataSize):
            self.engine.store_vector(self.data[i], data=self.id[i])

    def query(self, data):
        return self.engine.neighbours(data)
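A minimal usage sketch for the LSH class above; the path and data size are hypothetical and assume the pickled files exist in the layout the class expects:

lsh = LSH("features/", 1000)       # hypothetical path and size
lsh.preprocess()                   # load pickled ids, metadata and vectors
lsh.generate_hashtable()           # index all vectors with 20 random projections
matches = lsh.query(lsh.data[0])   # list of (vector, song_id, distance) tuples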
def loadHashmap(self, feature_size, result_n):
    # Create redis storage adapter
    redis_object = Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)
    try:
        # Get hash config from redis
        config = redis_storage.load_hash_configuration('test')
        # Config is existing, create hash with None parameters
        lshash = RandomBinaryProjections(None, None)
        # Apply configuration loaded from redis
        lshash.apply_config(config)
    except Exception:
        # Config is not existing, create hash from scratch, with 10 projections
        lshash = RandomBinaryProjections('test', 10)

    # Create engine for a 192-dimensional feature space and use our hash.
    # This will set the dimension of the lshash only the first time, not when
    # using the configuration loaded from redis. Use redis storage to store
    # buckets.
    nearest = NearestFilter(1000)
    #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
    self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest],
                         storage=redis_storage, distance=EuclideanDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
def RunAnnNearpy(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
    train, label = SplitTrainData(self.dataset)

    with totalTimer:
        # Get all the parameters.
        try:
            # Perform Approximate Nearest-Neighbors
            dimension = train.shape[1]
            rbp = RandomBinaryProjections('rbp', 10)
            engine = Engine(dimension, lshashes=[rbp])
            for i in range(len(train)):
                engine.store_vector(train[i], 'data_%d' % i)
            for i in range(len(queryData)):
                v = engine.neighbours(queryData[i])
        except Exception as e:
            Log.Info(e)
            q.put(e)
            return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    # Perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([train_example[1]
                       for train_example in X_train_normalized
                       if set(neighbors[0][0]) == set(train_example[0])])
    return labels
class PointCalculator():
    def __init__(self, point_list, point):
        self.__configure_calculator(point_list, point)

    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2
        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)
        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)

    def __load_point_list_in_engine(self):
        for index in range(len(self.__point_list__)):
            v = numpy.array(self.__point_list__[index])
            self.__engine__.store_vector(v, 'data_%d' % index)

    def set_searching_point_list(self, point_list):
        self.__point_list__ = point_list
        self.__load_point_list_in_engine()

    def set_query_point(self, point):
        self.__point__ = point

    def __get_nearest_point(self):
        return self.__engine__.neighbours(numpy.array(self.__point__))

    def get_nearest_point_array_coords(self):
        nearest_point = self.__get_nearest_point()
        return [nearest_point[0][0][0], nearest_point[0][0][1]]
class RandomBinaryNN(NearestNeighbor):
    """Nearest neighbor implementation using random binary projections from the nearpy package."""

    def __init__(self, dimension: int, number_projections: int, threshold: float):
        """
        :param dimension: Number of dimensions of input points
        :param number_projections: Number of random projections used for finding nearest
               neighbors. Trade-off: more projections result in a smaller number of
               false positives in the candidate set
        :param threshold: Distance threshold defining "nearest": all points within this distance
        """
        self.rbp = RandomBinaryProjections('rbp', number_projections)
        self.sqdist = SquaredEuclideanDistance()
        self.ann_engine = Engine(
            dimension,
            lshashes=[self.rbp],
            distance=self.sqdist,
            vector_filters=[DistanceThresholdFilter(threshold)])

    def insert_candidate(self, point: np.ndarray, metadata):
        self.ann_engine.store_vector(point, data=metadata)

    def get_candidates(self, point: np.ndarray):
        return [NearestNeighborResult(res[0], res[1], res[2])
                for res in self.ann_engine.neighbours(point)]
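A minimal usage sketch for RandomBinaryNN; it assumes the surrounding project provides the NearestNeighbor base class and NearestNeighborResult, and the dimensions here are illustrative:

import numpy as np

nn = RandomBinaryNN(dimension=128, number_projections=10, threshold=0.5)
nn.insert_candidate(np.random.randn(128), metadata="item-0")   # hypothetical data
for result in nn.get_candidates(np.random.randn(128)):
    print(result)   # NearestNeighborResult(vector, metadata, distance)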
def __init__(self, num_features, projection_count=30):
    self.num_features = num_features
    #self.rbp = RandomDiscretizedProjections('default', projection_count, bin_width=100)
    self.rbp = RandomBinaryProjections('default', projection_count)
    #self.rbp = RandomBinaryProjectionTree('default', projection_count, 1)
    self.text_engine = Engine(num_features, lshashes=[self.rbp],
                              distance=CosineDistance())
def main(args):
    """ Main entry. """
    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the top-k closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in range(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in range(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, num))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            reti = [y for x, y, z in np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def __init__(self, x):
    self.n, self.f = x.shape
    # Use NearPy LSH for fast approximate nearest neighbours
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine = Engine(self.f, lshashes=[rbp])
    for i in np.arange(self.n):
        v = x[i, :]
        self.engine.store_vector(v, i)
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows, num_cols = decade_matrix.get_shape()
    print("the number of rows: " + str(num_rows))

    # The engine dimension must match the vector (column) dimension
    rbp = RandomBinaryProjections('rbp', 256)
    engine = Engine(num_cols, lshashes=[rbp])
    for i in range(num_rows):
        # print(i)
        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
def fit(self, X):
    b = self.params['b']
    self.n, self.f = X.shape
    # Use NearPy LSH for fast approximate nearest neighbours
    rbp = RandomBinaryProjections('rbp', b)
    self.engine = Engine(self.f, lshashes=[rbp])
    for i in np.arange(self.n):
        v = np.squeeze(np.copy(X[i, :]))
        self.engine.store_vector(v, i)
class CFiltering:
    def __init__(self, matrix, max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()],
                 distance=Pearson()):
        if not isinstance(lshashes, list):
            raise TypeError("'lshashes' must be an instance of 'list'")
        if not isinstance(vector_filters, list):
            raise TypeError("'vector_filters' must be an instance of 'list'")

        self.underlying = Engine(
            len(matrix[0]),
            lshashes=lshashes,
            vector_filters=vector_filters + [NearestFilter(max_neighbours)],
            distance=distance)

        for vector in matrix:
            self.underlying.store_vector(vector)

    def predict(self, vector, precision):
        neighbours = self.underlying.neighbours(vector)
        if not neighbours:
            raise ValueError("Failed to acquire any neighbours")

        # Per-neighbour mean value
        average = [sum(neighbour) / len(neighbour)
                   for neighbour, _, _ in neighbours]
        avg = sum(vector) / len(vector)

        for i in range(len(vector)):
            if vector[i] < precision:
                weighted_sum = 0
                for j, (nvec, _, similarity) in enumerate(neighbours):
                    # Deviation of neighbour j's value for item i from that
                    # neighbour's mean
                    weighted_sum += similarity * (nvec[i] - average[j])
                vector[i] = avg + weighted_sum / len(vector)
        return vector
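A minimal usage sketch for CFiltering; the ratings matrix is an illustrative assumption, and the custom Pearson distance used as the default must be importable from the surrounding project:

ratings = [[4.0, 0.0, 3.5], [5.0, 2.0, 0.0], [4.5, 1.0, 3.0]]   # hypothetical data
cf = CFiltering(ratings, max_neighbours=2)
# Fill in entries below 0.5 (treated as missing) for a new user's vector:
print(cf.predict([4.0, 0.0, 3.0], precision=0.5))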
class GenerateHashTable():
    def __init__(self, measure="EuclideanDistance", data_path='data/classed_data/'):
        self.res = ResnetSimilarity()
        self.pbar = ProgressBar()
        # Dimension of our vector space
        self.dimension = 2048
        self.data_path = data_path
        # Create a random binary hash with 10 bits
        self.rbp = RandomBinaryProjections('rbp', 10)
        self.measure = measure
        self.msote = MemoryStorage()
        if measure == "EuclideanDistance":
            self.engine = Engine(self.dimension, lshashes=[self.rbp],
                                 storage=self.msote, distance=EuclideanDistance())
        else:
            self.engine = Engine(self.dimension, lshashes=[self.rbp],
                                 storage=self.msote, distance=CosineDistance())

    def generate_table(self):
        if self.measure == "CosineDistance":
            save_path = "hashed_objects/hashed_object_Cosine.pkl"
        elif self.measure == "EuclideanDistance":
            save_path = "hashed_objects/hashed_object_euclidean.pkl"
        else:
            save_path = "hashed_objects/" + str(self.measure) + ".pkl"

        count = 0
        for subdir, dirs, files in os.walk(self.data_path):
            for file in files:
                if '.jpg' in file:
                    img_path = os.path.join(subdir, file)
                    img = Image.open(img_path).convert('RGB')
                    if img.size[0] >= 100:
                        img_emb = self.res.getMapping(img)
                        img_emb = img_emb.view(-1, 2048)
                        img_emb = img_emb.numpy()
                        self.engine.store_vector(img_emb[0], img_path)
                        if count % 1000 == 0:
                            print("Saving Image Embedding ", count)
                        count += 1

        print("Saving File To", save_path)
        # Pickle the whole engine, including its in-memory buckets
        with open(save_path, 'wb') as filehandler:
            pickle.dump(self.engine, filehandler)
def __init__(self, emb_path, feature='title'):
    self.emb_path = emb_path
    self.feature = feature
    self.data_df = None
    self.tfidf = Vectorizer(**get_tfidf_params())
    self.fasttext_embedder = None
    self.fasttext_tfidf = None
    self.dimension = 300
    rbp = RandomBinaryProjections('rbp', 2)
    self.engine = Engine(self.dimension, lshashes=[rbp])
def get_engine(self, vocab, vecs):
    logging.info('{} hash functions'.format(self.args.projections))
    hashes = [PCABinaryProjections('ne1v', self.args.projections,
                                   vecs[:1000, :].T)]
    engine = Engine(vecs.shape[1], lshashes=hashes, distance=[],
                    vector_filters=[])
    for ind, vec in enumerate(vecs):
        if not ind % 100000:
            logging.info('{} words added to nearpy engine'.format(ind))
        engine.store_vector(vec, ind)
    return engine
def test_storage_issue(self):
    engine1 = Engine(100)
    engine2 = Engine(100)

    for k in range(1000):
        x = numpy.random.randn(100)
        x_data = 'data'
        engine1.store_vector(x, x_data)

    # Each engine should have its own default storage
    self.assertTrue(len(engine2.storage.buckets) == 0)
def build_index(self, X):
    f = X.shape[1]
    n = X.shape[0]

    rbp = RandomBinaryProjections('rbp', 32)
    engine = Engine(f, lshashes=[rbp])

    for i in range(n):
        engine.store_vector(X[i], 'data_%d' % i)
    return engine
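A query sketch against the engine returned by build_index() above; `obj` stands in for whatever instance owns the method, and the data is a hypothetical numpy array:

import numpy as np

X = np.random.randn(1000, 64)   # hypothetical data
engine = obj.build_index(X)
for vec, data, dist in engine.neighbours(X[0]):
    print(data, dist)           # e.g. 'data_0' with distance ~0 for the point itself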
def __init__(self, data_points, sim_threshold=0.5, num_vectors=3):
    self.data_points = data_points
    self.point_num = self.data_points.shape[0]
    # First column holds the point id, the rest is the feature vector
    self.dimension = self.data_points.shape[1] - 1

    # Create a random binary hash with num_vectors bits
    self.rbp = RandomBinaryProjections('rbp', num_vectors, rand_seed=42)
    self.engine = Engine(
        self.dimension,
        lshashes=[self.rbp],
        vector_filters=[DistanceThresholdFilter(1 - sim_threshold)])
    for i in range(self.point_num):
        self.engine.store_vector(self.data_points[i, 1:], '%d' % i)
class Neighbors:
    """Nearest neighbors."""

    def __init__(self, config, verbose=True, log_file=None):
        # set up logger
        self._logger = Logger.get_logger(self.__class__.__name__,
                                         log_file=log_file,
                                         silence=(not verbose),
                                         global_log_file=verbose)
        # read config
        self._parse_config(config)
        self._engine = None

    def _parse_config(self, config):
        self._num_neighbors = config["num_neighbors"]

    def _build_engine(self, dimension):
        # build NearPy engine
        self._logger.info("Building engine...")
        self._engine = Engine(
            dimension,
            vector_filters=[NearestFilter(self._num_neighbors)])

    def store(self, vectors, data=None, log_freq=10, verbose=True):
        self._logger.info("Storing vectors...")
        if data is not None:
            assert vectors.shape[0] == len(data), \
                "Dim 0 of vectors and data must match!"
        if self._engine is None:
            self._build_engine(vectors.shape[-1])

        num_vectors = vectors.shape[0]
        for idx in range(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Storing vector {} of {}...".format(idx, num_vectors))
            if data is not None:
                self._engine.store_vector(vectors[idx], data[idx])
            else:
                self._engine.store_vector(vectors[idx])

    def predict(self, vectors, log_freq=10, verbose=True):
        self._logger.info("Predicting...")
        num_vectors = vectors.shape[0]
        neighbors = []
        for idx in range(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Predicting vector {} of {}...".format(idx, num_vectors))
            neighbors.append(self._engine.neighbours(vectors[idx]))
        return neighbors
def fit(self, X, y=None, hash="randbinary"):
    X = np.array(X)
    assert len(X.shape) == 2, "X not 2-rank"
    dimension = X.shape[-1]

    if hash == "randbinary":
        rbp = RandomBinaryProjections('rbp', 10)
    elif hash == "pcabinary":
        rbp = PCABinaryProjections('rbp', 10, training_set=X)

    self.engine = Engine(dimension, lshashes=[rbp])
    for index, x in enumerate(X):
        self.engine.store_vector(x, str(index))
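A short sketch contrasting the two hash choices accepted by fit() above; `model` is a stand-in for the owning instance and the data is hypothetical:

import numpy as np

X = np.random.randn(500, 32)       # hypothetical data
model.fit(X, hash="randbinary")    # data-independent random hyperplanes
model.fit(X, hash="pcabinary")     # hyperplanes trained on X's principal components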
class LSHIndex(Index):
    def __init__(self, hasher, number_of_tables=6, length_of_tables=12,
                 match_thresh=0.2, association_thresh=0.1, storage=memoryStorage):
        """
        :param hasher: @type hasher: Hasher
        """
        Index.__init__(self, hasher,
                       number_of_tables=number_of_tables,
                       length_of_tables=length_of_tables,
                       match_thresh=match_thresh,
                       association_thresh=association_thresh)
        self.hasher = hasher
        self.match_thresh = match_thresh
        self.association_thresh = association_thresh
        self.tables = [None] * number_of_tables
        for i in range(number_of_tables):
            self.tables[i] = RandomBinaryProjections(str(i), length_of_tables)
        self.engine = Engine(self.hasher.dims(), lshashes=self.tables,
                             storage=storage(),
                             fetch_vector_filters=[NoVectorFilter()])

    def index(self, id, img):
        item = self.hasher.hash(id, img)
        for i in range(len(item.descriptors)):
            self.engine.store_vector(item.descriptors[i],
                                     data=(id, item.keypoints[i], item.descriptors[i]))
        return item

    def find(self, id, img, index_if_not_found=False):
        item = self.hasher.hash(id, img)
        matches = {}
        #count_min = self.association_thresh * float(len(item.descriptors))
        for x in item.descriptors:
            for neighbour in self.engine.neighbours(x):
                if neighbour[1][0] in matches:
                    continue
                y = neighbour[1][2]
                dist = l2norm(x, y)
                key = neighbour[1][0]
                if dist < self.match_thresh:
                    #if dist > 0.0001:
                    #    print('{} {} {}'.format(id, neighbour[1][0], dist))
                    matches[key] = (matches[key] + 1) if key in matches else 1
        if id not in matches and index_if_not_found:
            for i in range(len(item.descriptors)):
                self.engine.store_vector(item.descriptors[i],
                                         data=(id, item.keypoints[i], item.descriptors[i]))
        #for id, count in matches.items():
        #    #if count >= count_min:
        #    yield id
        return list(matches.keys())
def _build_rdp_engine(self, matrix, rdp, normals):
    # Dimension of our vector space
    dimension = np.shape(matrix)[1]
    n = np.shape(matrix)[0]

    # Create engine with pipeline configuration, reusing the provided
    # projection hash and overriding its normals
    engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
    rdp.vectors = normals

    for index in range(n):
        v = matrix[index]
        engine.store_vector(v, '%d' % index)
    return engine
def load_DL(self, vector_set):
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine_ = Engine(self.biggest, lshashes=[rbp])
    for i in range(len(list(self.training_))):
        vector = vector_set[:, i]
        vector = np.reshape(vector, (self.biggest, 1))
        vector = self.DL_[-1].transform(vector)
        self.engine_.store_vector(vector[:, 0], self.training_[i])
def load_KPCA(self, vector_set):
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine_ = Engine(self.KPCA_.alphas_.shape[1], lshashes=[rbp])
    transformed_vectors = self.KPCA_.transform(vector_set.T)
    for i in range(len(list(self.training_))):
        self.engine_.store_vector(transformed_vectors[i, :], self.training_[i])
import scipy.sparse as ss
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

def test_sparse():
    dim = 500
    num_train = 1000
    num_test = 1
    train_data = ss.rand(dim, num_train)
    test_data = ss.rand(dim, num_test)

    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dim, lshashes=[rbp])

    for i in range(num_train):
        engine.store_vector(train_data.getcol(i))

    for j in range(num_test):
        N = engine.neighbours(test_data.getcol(j))
        print(N)
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets) == 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets) == 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
def load_PCA(self, vector_set):
    """Reinitialize our engine and load a numpy set of vectors of
    dimension (self.biggest, 1) into self.engine_."""
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine_ = Engine(self.PCA_.components_.shape[1], lshashes=[rbp])
    transformed_vectors = self.PCA_.transform(vector_set.T)
    for i in range(len(list(self.training_))):
        self.engine_.store_vector(transformed_vectors[i, :], self.training_[i])
class lshsearcher:
    def __init__(self):
        self.__dimension = None
        self.__engine_perm = None
        self.__permutations = None

    def _set_confval(self, dimension=None):
        if dimension is None:
            return None
        self.__dimension = dimension

    def _engine_on(self):
        # Create permutations meta-hash
        self.__permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp_perm = RandomBinaryProjections('rbp_perm', 14)
        rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

        # Add rbp as child hash of permutations hash
        self.__permutations.add_child_hash(rbp_perm, rbp_conf)

        # Create engine
        self.__engine_perm = Engine(self.__dimension,
                                    lshashes=[self.__permutations],
                                    distance=CosineDistance())

    def conf(self, dimension):
        self._set_confval(dimension)
        self._engine_on()

    def getData(self, v):
        if self.__engine_perm is not None:
            self.__engine_perm.store_vector(v)

    def commitData(self):
        if self.__permutations is not None:
            self.__permutations.build_permuted_index()

    def find(self, v):
        if self.__engine_perm is not None:
            return self.__engine_perm.neighbours(v)
class NearPy(NearestNeighbor):
    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)

        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]

        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()
        # logging.info('Took %f sec' % (train_end - train_start))

    def within_distance(self, x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into a new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)

        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)

        if len(query_result) == 0:
            return [], []

        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
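A minimal usage sketch for the NearPy wrapper above; it assumes the NearestNeighbor base class supplies featurize() and phi_, and the data is hypothetical:

import numpy as np

index = NearPy()                   # Euclidean distance, identity featurizer
index.train(np.random.randn(200, 16), k=5)
items, dists = index.nearest_neighbors(np.random.randn(16), k=5)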
class TestPermutation(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):
        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in range(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print('\nNeighbour distances with permuted index:')
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print(dists)

        # Do random query on engine without permutations meta-hash
        print('\nNeighbour distances without permuted index (distances should be larger):')
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print(dists)

        # Real neighbours
        print('\nReal neighbour distances:')
        query = query.reshape((1, 200))
        dists = CosineDistance().distance_matrix(matrix, query)
        dists = dists.reshape((-1,))
        dists = sorted(dists)
        print(dists[:10])
def test_retrieval(self):
    # We want 12 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

    # Create engine for 100-dimensional feature space; do not forget to set
    # the nearest filter to 20, because the default is 10
    self.engine = Engine(100, lshashes=[rbpt],
                         vector_filters=[NearestFilter(20)])

    # First insert 200000 random vectors
    for k in range(200000):
        x = numpy.random.randn(100)
        x_data = 'data {}'.format(k)
        self.engine.store_vector(x, x_data)

    # Now do random queries and check result set size
    for k in range(10):
        x = numpy.random.randn(100)
        n = self.engine.neighbours(x)
        self.assertEqual(len(n), 20)
def example2():
    # Dimension of feature space
    DIM = 100

    # Number of data points (don't do too much because of exact search)
    POINTS = 20000

    ##########################################################

    print('Performing indexing with HashPermutations...')
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 3
    print('\nNeighbour distances with HashPermutations:')
    print('  -> Candidate count is %d' % engine_perm.candidate_count(query))
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])

    ##########################################################

    print('\nPerforming indexing with HashPermutationMapper...')
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print('\nNeighbour distances with HashPermutationMapper:')
    print('  -> Candidate count is %d' % engine_perm2.candidate_count(query))
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])

    ##########################################################

    print('\nPerforming indexing with multiple binary hashes...')
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in range(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print('Indexing took %f seconds' % (t1 - t0))

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 5
    print('\nNeighbour distances with multiple binary hashes:')
    print('  -> Candidate count is %d' % engine_rbps.candidate_count(query))
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print(dists)

    # Real neighbours
    print('\nReal neighbour distances:')
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print(dists[:10])
def test_delete_vector_with_provided_value(self):
    engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
    self.fill_engine(engine)
    engine.delete_vector(self.removed_value, self.removed_vector)
    self.check_delete(engine)
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'r') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = list(map(float, tmp_feature[item].split(',')))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print('PCA matrix: ', len(matrix))

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2],
                             distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'r') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1

        for item in self.face_feature.keys():
            v = list(map(float, self.face_feature[item].split(',')))
            self.engine.store_vector(v, item)

    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = list(map(float, self.face_feature[person].split(',')))
            print('\nNeighbour distances with multiple binary hashes:')
            print('  -> Candidate count is %d' % self.engine.candidate_count(query))
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]

        res = list(zip(dists, scores, t_num))
        res.sort(key=lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [x for x in zip_seq
                if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
class TestRandomBinaryProjectionTree(unittest.TestCase):
    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100-dimensional feature space; do not forget to set
        # the nearest filter to 20, because the default is 10
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100-dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100-dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])