class TestEngine(unittest.TestCase): def setUp(self): self.engine = Engine(1000) def test_retrieval(self): for k in range(100): self.engine.clean_all_buckets() x = numpy.random.randn(1000) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y = n[0][0] y_data = n[0][1] y_distance = n[0][2] self.assertTrue((y == x).all()) self.assertEqual(y_data, x_data) self.assertEqual(y_distance, 0.0) def test_retrieval_sparse(self): for k in range(100): self.engine.clean_all_buckets() x = scipy.sparse.rand(1000, 1, density=0.05) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y = n[0][0] y_data = n[0][1] y_distance = n[0][2] self.assertTrue((y - x).sum() == 0.0) self.assertEqual(y_data, x_data) self.assertEqual(y_distance, 0.0)
class NearPy(NearestNeighbor):
    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])
        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)
        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]
        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()
        # logging.info('Took %f sec' % (train_end - train_start))

    def within_distance(self, x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into a new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)
        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)
        if len(query_result) == 0:
            return [], []
        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
class TestPermutation(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.WARNING) # Create permutations meta-hash self.permutations = HashPermutations('permut') # Create binary hash as child hash rbp = RandomBinaryProjections('rbp1', 4) rbp_conf = { 'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100 } # Add rbp as child hash of permutations hash self.permutations.add_child_hash(rbp, rbp_conf) # Create engine with meta hash and cosine distance self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance()) # Create engine without permutation meta-hash self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance()) def test_runnable(self): # First index some random vectors matrix = numpy.zeros((1000, 200)) for i in xrange(1000): v = numpy.random.randn(200) matrix[i] = v self.engine.store_vector(v) self.engine_perm.store_vector(v) # Then update permuted index self.permutations.build_permuted_index() # Do random query on engine with permutations meta-hash print '\nNeighbour distances with permuted index:' query = numpy.random.randn(200) results = self.engine_perm.neighbours(query) dists = [x[2] for x in results] print dists # Do random query on engine without permutations meta-hash print '\nNeighbour distances without permuted index (distances should be larger):' results = self.engine.neighbours(query) dists = [x[2] for x in results] print dists # Real neighbours print '\nReal neighbour distances:' query = query.reshape((1, 200)) dists = CosineDistance().distance_matrix(matrix, query) dists = dists.reshape((-1, )) dists = sorted(dists) print dists[:10]
class TestPermutation(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.WARNING) numpy.random.seed(11) # Create permutations meta-hash self.permutations = HashPermutations('permut') # Create binary hash as child hash rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19) rbp_conf = { 'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100 } # Add rbp as child hash of permutations hash self.permutations.add_child_hash(rbp, rbp_conf) # Create engine with meta hash and cosine distance self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance()) # Create engine without permutation meta-hash self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance()) def test_runnable(self): # First index some random vectors matrix = numpy.zeros((1000, 200)) for i in xrange(1000): v = numpy.random.randn(200) matrix[i] = v self.engine.store_vector(v) self.engine_perm.store_vector(v) # Then update permuted index self.permutations.build_permuted_index() # Do random query on engine with permutations meta-hash query = numpy.random.randn(200) results = self.engine_perm.neighbours(query) permuted_dists = [x[2] for x in results] # Do random query on engine without permutations meta-hash (distances # should be larger):' results = self.engine.neighbours(query) dists = [x[2] for x in results] self.assertLess(permuted_dists[0], dists[0])
class RandomBinaryNN(NearestNeighbor): """ Nearest neighbor implementation by using random binary trees from nearpy package """ def __init__(self, dimension: int, number_projections: int, threshold: float): """ :param dimension: Number of dimensions of input points :param number_projections: Number of random projections used for finding nearest neighbors. Trade-off: More projections result in a smaller number of false positives in candidate set :param threshold: Distance threshold for definition nearest: all points within this specific distance """ self.rbp = RandomBinaryProjections('rbp', number_projections) self.sqdist = SquaredEuclideanDistance() self.ann_engine = Engine( dimension, lshashes=[self.rbp], distance=self.sqdist, vector_filters=[DistanceThresholdFilter(threshold)]) def insert_candidate(self, point: np.ndarray, metadata): self.ann_engine.store_vector(point, data=metadata) def get_candidates(self, point: np.ndarray): return [ NearestNeighborResult(res[0], res[1], res[2]) for res in self.ann_engine.neighbours(point) ]
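# A minimal usage sketch for RandomBinaryNN above (not from the original source):
# the dimension, metadata payload and result handling are illustrative assumptions;
# NearestNeighborResult comes from the surrounding project.
import numpy as np

nn = RandomBinaryNN(dimension=128, number_projections=10, threshold=0.5)
for idx in range(1000):
    nn.insert_candidate(np.random.randn(128), metadata={'id': idx})
# get_candidates returns NearestNeighborResult objects for all stored points
# that fall within the squared-Euclidean threshold of the query
candidates = nn.get_candidates(np.random.randn(128))
print(len(candidates))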
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])
    # Perform hashing for train examples, storing each label as the vector's data
    for train_example, label in zip(X_train, y_train):
        normalized = train_example / np.linalg.norm(train_example)
        engine.store_vector(normalized, label.tolist())
    labels = []
    for test_example in X_test:
        # neighbours() returns (vector, data, distance) tuples; keep the labels
        neighbors = engine.neighbours(test_example)
        labels.append([neighbor[1] for neighbor in neighbors])
    return labels
def knn(data,k): assert k<=len(data)-1, 'The number of neighbors must be smaller than the data cardinality (minus one)' k=k+1 n,dimension = data.shape ind = [] dist = [] if(dimension<10): rbp = RandomBinaryProjections('rbp', dimension) else: rbp = RandomBinaryProjections('rbp',10) engine = Engine(dimension, lshashes=[rbp], vector_filters=[NearestFilter(k)]) for i in range(n): engine.store_vector(data[i], i) for i in range(n): N = engine.neighbours(data[i]) ind.append([x[1] for x in N][1:]) dist.append([x[2] for x in N][1:]) return N,dist,ind
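# A small, hypothetical call of the knn() helper above; the data layout and
# sizes are made up for illustration.
import numpy as np

data = np.random.randn(200, 8)      # 200 points in 8 dimensions
N, dist, ind = knn(data, k=5)
# ind[i] and dist[i] hold the indices and distances of the approximate
# neighbours of data[i], with the point itself stripped from the front.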
class LSH: def __init__(self, path, dataSize): self.path = path self.dataSize = dataSize def preprocess(self): ids = [] meta = [] data = [] for i in range(self.dataSize): with open(self.path + str(i) + ".data", "rb") as file: f_song_id = pickle.load(file) f_songMeta = pickle.load(file) f_data = pickle.load(file) ids.append(f_song_id) meta.append(f_songMeta) data.append(f_data) self.id = np.array(ids) self.meta = np.array(meta) self.data = np.array(data) def generate_hashtable(self): self.engine = Engine(self.data.shape[1], lshashes=[RandomBinaryProjections('rbp', 20)]) for i in range(self.dataSize): self.engine.store_vector(self.data[i], data=self.id[i]) def query(self, data): return self.engine.neighbours(data)
class StateDBEngine(object): def __init__(self): # initialize "nearby" library self.dim = 4 self.rbp = RandomBinaryProjections('rbp', 100) self.engine = Engine(self.dim, lshashes=[self.rbp]) # performance counter self.counter = 0 def add(self, x, data): # print 'add data = ', data self.engine.store_vector(x, data) self.counter += 1 def lookup(self, x, THRESHOLD=0.1): naver = self.engine.neighbours(x) if len(naver) == 0: return None pt, data, d = naver[0] # print 'lhs, rhs', x, pt, # print 'd = ', d, (d < THRESHOLD), (data is None) if d < THRESHOLD: return data else: return None
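# Hypothetical usage of StateDBEngine above; the 4-dimensional state vectors
# and payload strings are made up for illustration.
import numpy as np

db = StateDBEngine()
db.add(np.array([0.1, 0.2, 0.3, 0.4]), 'state-1')
print(db.lookup(np.array([0.1, 0.2, 0.3, 0.4])))  # 'state-1' (distance 0 < THRESHOLD)
print(db.lookup(np.array([9.0, 9.0, 9.0, 9.0])))  # likely None: far from anything stored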
class PointCalculator(): def __init__(self, point_list, point): self.__configure_calculator(point_list, point) def __configure_calculator(self, point_list, point): # Dimension of our vector space self.__dimension__ = 2 # Create a random binary hash with 10 bits self.__rbp__ = RandomBinaryProjections('rbp', 10) # Create engine with pipeline configuration self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__]) self.set_searching_point_list(point_list) self.set_query_point(point) def __load_point_list_in_engine(self): for index in xrange(0, len(self.__point_list__)): v = numpy.array(self.__point_list__[index]) self.__engine__.store_vector(v, 'data_%d' % index) def set_searching_point_list(self, point_list): self.__point_list__ = point_list self.__load_point_list_in_engine() def set_query_point(self, point): self.__point__ = point def __get_nearest_point(self): return self.__engine__.neighbours(numpy.array(self.__point__)) def get_nearest_point_array_coords(self): nearest_point = self.__get_nearest_point() return [nearest_point[0][0][0], nearest_point[0][0][1]]
class ImageSimilarity(): def __init__(self, distanceMeasure="EuclideanDistance"): self.res_similar = ResnetSimilarity() dimension = 2048 rbp = RandomBinaryProjections('rbp', 10) self.engine = Engine(dimension, lshashes=[rbp]) if distanceMeasure == "EuclideanDistance": self.filehandler = open("hashed_objects/hashed_object_euclidean.pkl", 'rb') elif distanceMeasure == "Test": self.filehandler = open("hashed_objects/hashed_object_example.pkl", 'rb') else: self.filehandler = open("hashed_objects/hashed_object_Cosine.pkl", 'rb') self.engine = pickle.load(self.filehandler) self.filehandler.close() print("Hash Table Loaded") def query(self, image): result = [] image_emb = self.res_similar.getMapping(image) image_emb = image_emb.view(-1, 2048) image_emb = image_emb.numpy() N = self.engine.neighbours(image_emb[0]) for i in range(len(N)): result.append(N[i][1]) if i == 5: break return result def tearDown(self): self.filehandler.close()
def RunAnnNearpy(q): totalTimer = Timer() # Load input dataset. Log.Info("Loading dataset", self.verbose) queryData = np.genfromtxt(self.dataset[1], delimiter=',') train, label = SplitTrainData(self.dataset) with totalTimer: # Get all the parameters. try: # Perform Approximate Nearest-Neighbors dimension = train.shape[1] rbp = RandomBinaryProjections('rbp', 10) engine = Engine(dimension, lshashes=[rbp]) for i in range(len(train)): engine.store_vector(train[i], 'data_%d' % i) for i in range(len(queryData)): v = engine.neighbours(queryData[i]) except Exception as e: Log.Info(e) q.put(e) return -1 time = totalTimer.ElapsedTime() q.put(time) return time
class TestEngine(unittest.TestCase): def setUp(self): self.engine = Engine(1000) def test_storage_issue(self): engine1 = Engine(100) engine2 = Engine(100) for k in range(1000): x = numpy.random.randn(100) x_data = 'data' engine1.store_vector(x, x_data) # Each engine should have its own default storage self.assertEqual(len(engine2.storage.buckets), 0) def test_retrieval(self): for k in range(100): self.engine.clean_all_buckets() x = numpy.random.randn(1000) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y, y_data, y_distance = n[0] normalized_x = unitvec(x) delta = 0.000000001 self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta) self.assertEqual(y_data, x_data) self.assertAlmostEqual(y_distance, 0.0, delta=delta) def test_retrieval_sparse(self): for k in range(100): self.engine.clean_all_buckets() x = scipy.sparse.rand(1000, 1, density=0.05) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y, y_data, y_distance = n[0] normalized_x = unitvec(x) delta = 0.000000001 self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta) self.assertEqual(y_data, x_data) self.assertAlmostEqual(y_distance, 0.0, delta=delta)
def main(args): """ Main entry. """ data = Dataset(args.dataset) num, dim = data.base.shape # We are looking for the ten closest neighbours nearest = NearestFilter(args.topk) # We want unique candidates unique = UniqueFilter() # Create engines for all configurations for nbit, ntbl in itertools.product(args.nbits, args.ntbls): logging.info("Creating Engine ...") lshashes = [RandomBinaryProjections('rbp%d' % i, nbit) for i in xrange(ntbl)] # Create engine with this configuration engine = Engine(dim, lshashes=lshashes, vector_filters=[unique, nearest]) logging.info("\tDone!") logging.info("Adding items ...") for i in xrange(num): engine.store_vector(data.base[i, :], i) if i % 100000 == 0: logging.info("\t%d/%d" % (i, data.nbae)) logging.info("\tDone!") ids = np.zeros((data.nqry, args.topk), np.int) logging.info("Searching ...") tic() for i in xrange(data.nqry): reti = [y for x, y, z in np.array(engine.neighbours(data.query[i]))] ids[i, :len(reti)] = reti if i % 100 == 0: logging.info("\t%d/%d" % (i, data.nqry)) time_costs = toc() logging.info("\tDone!") report = os.path.join(args.exp_dir, "report.txt") with open(report, "a") as rptf: rptf.write("*" * 64 + "\n") rptf.write("* %s\n" % time.asctime()) rptf.write("*" * 64 + "\n") r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1] with open(report, "a") as rptf: rptf.write("=" * 64 + "\n") rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl)) rptf.write("-" * 64 + "\n") rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k)) rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
class TestEngine(unittest.TestCase): def setUp(self): self.engine = Engine(1000) def test_storage_issue(self): engine1 = Engine(100) engine2 = Engine(100) for k in range(1000): x = numpy.random.randn(100) x_data = 'data' engine1.store_vector(x, x_data) # Each engine should have its own default storage self.assertTrue(len(engine2.storage.buckets)==0) def test_retrieval(self): for k in range(100): self.engine.clean_all_buckets() x = numpy.random.randn(1000) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y = n[0][0] y_data = n[0][1] y_distance = n[0][2] self.assertTrue((y == x).all()) self.assertEqual(y_data, x_data) self.assertEqual(y_distance, 0.0) def test_retrieval_sparse(self): for k in range(100): self.engine.clean_all_buckets() x = scipy.sparse.rand(1000, 1, density=0.05) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y = n[0][0] y_data = n[0][1] y_distance = n[0][2] self.assertTrue((y - x).sum() == 0.0) self.assertEqual(y_data, x_data) self.assertEqual(y_distance, 0.0)
class TestEngine(unittest.TestCase): def setUp(self): self.engine = Engine(1000) def test_storage_issue(self): engine1 = Engine(100) engine2 = Engine(100) for k in range(1000): x = numpy.random.randn(100) x_data = 'data' engine1.store_vector(x, x_data) # Each engine should have its own default storage self.assertTrue(len(engine2.storage.buckets)==0) def test_retrieval(self): for k in range(100): self.engine.clean_all_buckets() x = numpy.random.randn(1000) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y, y_data, y_distance = n[0] normalized_x = unitvec(x) delta = 0.000000001 self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta) self.assertEqual(y_data, x_data) self.assertAlmostEqual(y_distance, 0.0, delta=delta) def test_retrieval_sparse(self): for k in range(100): self.engine.clean_all_buckets() x = scipy.sparse.rand(1000, 1, density=0.05) x_data = 'data' self.engine.store_vector(x, x_data) n = self.engine.neighbours(x) y, y_data, y_distance = n[0] normalized_x = unitvec(x) delta = 0.000000001 self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta) self.assertEqual(y_data, x_data) self.assertAlmostEqual(y_distance, 0.0, delta=delta)
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows, num_cols = decade_matrix.get_shape()
    print("the number of rows:" + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    # The engine dimension must match the stored row vectors, i.e. the column count
    engine = Engine(num_cols, lshashes=[rbp])
    for i in range(num_rows):
        print(i)
        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
class CFiltering:
    def __init__(self, matrix, max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()], distance=Pearson()):
        if not isinstance(lshashes, list):
            raise TypeError("'lshashes' must be an instance of 'list'")
        if not isinstance(vector_filters, list):
            raise TypeError("'vector_filters' must be an instance of 'list'")
        self.underlying = Engine(len(matrix[0]), lshashes=lshashes,
                                 vector_filters=vector_filters + [NearestFilter(max_neighbours)],
                                 distance=distance)
        for vector in matrix:
            self.underlying.store_vector(vector)

    def predict(self, vector, precision):
        neighbours = self.underlying.neighbours(vector)
        if not neighbours:
            raise ValueError("Failed to acquire any neighbours")
        average = [sum(neighbour) / len(neighbour) for neighbour, _, _ in neighbours]
        avg = sum(vector) / len(vector)
        for i in range(len(vector)):
            if vector[i] < precision:
                weighted_sum = 0
                for j, neighbour in enumerate(neighbours):
                    neighbour, _, similarity = neighbour
                    # Use the neighbour's value for item i, centred on that neighbour's average
                    weighted_sum += similarity * (neighbour[i] - average[j])
                vector[i] = avg + weighted_sum / len(vector)
        return vector
class Neighbors: """ Nearest neighbors. """ def __init__(self, config, verbose=True, log_file=None): # set up logger self._logger = Logger.get_logger(self.__class__.__name__, log_file=log_file, silence=(not verbose), global_log_file=verbose) # read config self._parse_config(config) self._engine = None def _parse_config(self, config): self._num_neighbors = config["num_neighbors"] def _build_engine(self, dimension): # build NearPy engine self._logger.info("Building engine...") self._engine = Engine( dimension, vector_filters=[NearestFilter(self._num_neighbors)]) def store(self, vectors, data=None, log_freq=10, verbose=True): self._logger.info("Storing vectors...") if data is not None: assert vectors.shape[0] == len( data), "Dim 0 of vectors and data must match!" if self._engine is None: self._build_engine(vectors.shape[-1]) num_vectors = vectors.shape[0] for idx in xrange(num_vectors): if verbose and idx % log_freq == 0: self._logger.info("Storing vector {} of {}...".format( idx, num_vectors)) if data is not None: self._engine.store_vector(vectors[idx], data[idx]) else: self._engine.store_vector(vectors[idx]) def predict(self, vectors, log_freq=10, verbose=True): self._logger.info("Predicting...") num_vectors = vectors.shape[0] neighbors = [] for idx in xrange(num_vectors): if verbose and idx % log_freq == 0: self._logger.info("Predicting vector {} of {}...".format( idx, num_vectors)) neighbors.append(self._engine.neighbours(vectors[idx])) return neighbors
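# A sketch of how the Neighbors wrapper above might be driven; the config key
# comes from _parse_config, everything else (shapes, data labels) is assumed,
# and the Logger helper is provided by the surrounding project.
import numpy as np

neighbors = Neighbors({"num_neighbors": 5}, verbose=False)
vectors = np.random.randn(100, 32)
neighbors.store(vectors, data=list(range(100)), verbose=False)
# predict() returns, per query vector, the (vector, data, distance) tuples of
# its nearest stored neighbours
results = neighbors.predict(np.random.randn(3, 32), verbose=False)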
class LSHIndex(Index): def __init__(self, hasher, number_of_tables=6, length_of_tables=12, match_thresh=0.2, association_thresh=0.1, storage=memoryStorage): """ :param hasher: @type hasher: Hasher """ Index.__init__(self, hasher, number_of_tables=number_of_tables, length_of_tables=length_of_tables, match_thresh=match_thresh, association_thresh=association_thresh) self.hasher = hasher self.match_thresh = match_thresh self.association_thresh = association_thresh self.tables = [None]*number_of_tables for i in range(number_of_tables): self.tables[i] = RandomBinaryProjections(str(i), length_of_tables) self.engine = Engine(self.hasher.dims(), lshashes=self.tables, storage=storage(), fetch_vector_filters=[NoVectorFilter()]) def index(self, id, img): item = self.hasher.hash(id, img) for i in range(len(item.descriptors)): self.engine.store_vector(item.descriptors[i],data=(id, item.keypoints[i], item.descriptors[i])) return item def find(self, id, img, index_if_not_found=False): item = self.hasher.hash(id, img) matches = {} #count_min =self.association_thresh * float(len(item.descriptors)) for x in item.descriptors: for neighbour in self.engine.neighbours(x): if neighbour[1][0] in matches: continue y = neighbour[1][2] dist = l2norm(x, y) key = neighbour[1][0] if dist < self.match_thresh: #if dist > 0.0001: # print('{} {} {}'.format(id, neighbour[1][0], dist)) matches[key] = (matches[key] + 1) if key in matches else 1 if id not in matches and index_if_not_found: for i in range(len(item.descriptors)): self.engine.store_vector(item.descriptors[i], data=(id, item.keypoints[i], item.descriptors[i])) #for id, count in matches.items(): # #if count >= count_min: # yield id return list(matches.keys())
def start(dataset, test_vector, num_nearest=5):
    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)
    # Create engine with pipeline configuration; the dimension is the vector length,
    # and NearestFilter keeps only the num_nearest closest results
    engine = Engine(dataset.shape[1], lshashes=[rbp],
                    vector_filters=[NearestFilter(num_nearest)])
    # Index the dataset vectors (set their data to a unique string)
    for i, v in enumerate(dataset):
        engine.store_vector(v, 'data_%d' % i)
    # Get nearest neighbours
    N = engine.neighbours(test_vector)
    return N
class lshNN(NNs): """ Locality-sensitive hashing by random projection consider some options nearpy implementation """ def __init__(self, b=16): self.params = {"method": "product quantization, numpy", 'b': b} def fit(self, X): b = self.params['b'] self.n, self.f = X.shape # Use NearPy lsh for fast ann rbp = RandomBinaryProjections('rbp', b) self.engine = Engine(self.f, lshashes=[rbp]) for i in np.arange(self.n): v = np.squeeze(np.copy(X[i, :])) self.engine.store_vector(v, i) def _get_one_knn(self, v, k=3): v = np.squeeze(np.copy(v)) vl = v.shape if vl[0] != self.f: # print(vl) raise Exception("Data Not Match") N = self.engine.neighbours(v) nni = -np.ones(k, dtype='int') nnd = np.empty(k) nnd[:] = np.nan for i in np.arange(k): try: nni[i] = N[i][1] nnd[i] = N[i][2] except IndexError: break return (nni, nnd) def get_knn(self, x, k=3): self.n, self.f = x.shape nni = -np.ones((self.n, k), dtype='int') nnd = np.empty((self.n, k)) nnd[:] = np.nan for i in np.arange(self.n): i_i, i_d = self._get_one_knn(x[i, :], k) nni[i, :] = i_i nnd[i, :] = i_d return (nni, nnd)
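# Hypothetical use of the lshNN wrapper above on random data; the sizes are
# illustrative only.
import numpy as np

ann = lshNN(b=16)
X = np.random.randn(500, 64)
ann.fit(X)
# Query the first ten points; missing neighbours are padded with -1 / NaN
nni, nnd = ann.get_knn(X[:10], k=3)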
class LSHRandomProjectionsIndex: def __init__(self, num_features, projection_count=30): self.num_features = num_features self.rbp = RandomBinaryProjections('default', projection_count) self.text_engine = Engine(num_features, lshashes=[self.rbp], distance=CosineDistance()) def index(self, vector, key): if len(vector) != self.num_features: print("ERROR received vector.dim: " + str(len(vector)) + " on engine.dim: " + str(self.num_features)) raise Exception self.text_engine.store_vector(vector, key) def query(self, vector): res = self.text_engine.neighbours(vector) return res
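# Hypothetical usage of LSHRandomProjectionsIndex above; the dimension and the
# document keys are made up for illustration.
import numpy as np

index = LSHRandomProjectionsIndex(num_features=300)
for i in range(100):
    index.index(np.random.randn(300), 'doc_%d' % i)
# query() returns (vector, key, cosine distance) tuples from the engine
for vec, key, dist in index.query(np.random.randn(300)):
    print(key, dist)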
def lshSearch(dataBase_, query_, k): featureNum_ = len(dataBase_) dimension_ = len(dataBase_[0]) rbp_ = RandomBinaryProjections('rbp', 30) engine_ = Engine(dimension_, lshashes=[rbp_], vector_filters=[NearestFilter(k)]) for i in range(featureNum_): v_ = dataBase_[i] engine_.store_vector(v_, '{}'.format(i)) N_ = engine_.neighbours(query_, distance='euclidean') index_ = [int(x[1]) for x in N_] return index_
class DB: def __init__(self, feature_size=16, nearest_neighbours=1000): self.feature_size = feature_size self.nn = nearest_neighbours self.engine = None self.load_hashmap() def load_hashmap(self): # Create redis storage adapter # need to start redis service redis_object = Redis(host='localhost', port=6379, db=14) redis_storage = RedisStorage(redis_object) try: config = redis_storage.load_hash_configuration('test') lshash = RandomBinaryProjections(None, None) lshash.apply_config(config) except: # Config is not existing, create hash from scratch, with 10 projections lshash = RandomBinaryProjections('test', 10) nearest = NearestFilter(self.nn) # self.engine = Engine(feature_size, lshashes=[], vector_filters=[]) self.engine = Engine(self.feature_size, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=CosineDistance()) # Do some stuff like indexing or querying with the engine... # Finally store hash configuration in redis for later use redis_storage.store_hash_configuration(lshash) def query(self, fvector): query = np.asarray(fvector) # get nn nearest neighbours # a list of tuple (data, name, distance) N = self.engine.neighbours(query) return N def append_to_DB(self, fvector, name=""): if fvector is None: return self.engine.store_vector(np.asarray(fvector), name)
def test_sparse(): dim = 500 num_train = 1000 num_test = 1 train_data = ss.rand(dim, num_train)#pickle.load('/home/jmahler/Downloads/feature_objects.p') test_data = ss.rand(dim, num_test) rbp = RandomBinaryProjections('rbp', 10) engine = Engine(dim, lshashes=[rbp]) for i in range(num_train): engine.store_vector(train_data.getcol(i)) for j in range(num_test): N = engine.neighbours(test_data.getcol(j)) print N IPython.embed()
def nearest_neighbour(self, fname): """ Finds the "n_no" of nearest neighbours for each Query Question and writes it in a file "fname" given as a parameter. """ rbp = RandomBinaryProjections('rbp', self.n_no) # Create a random binary hash with 10 bits engine = Engine(self.dim, lshashes=[rbp]) # Create engine with pipeline configuration qout = open(fname, "w") for i in range(self.train_size + 1): engine.store_vector(np.transpose(self.vectors[i]), i) for i in range(self.train_size + 1, self.q_total): N = engine.neighbours(np.transpose(self.vectors[i])) qout.write(self.q_actual[i]) for j in range(len(N)): qout.write("NN %d --- " % (j + 1) + self.q_actual[N[j][1]]) qout.close()
def debug(): # Dimension of our vector space dimension = 500 # Create a random binary hash with 10 bits rbp = RandomBinaryProjections('rbp', 10) # Create engine with pipeline configuration engine = Engine(dimension, lshashes=[rbp]) # Index 1000000 random vectors (set their data to a unique string) for index in range(100000): v = numpy.random.randn(dimension) engine.store_vector(v, 'data_%d' % index) # Create random query vector query = numpy.random.randn(dimension) # Get nearest neighbours N = engine.neighbours(query)
class lshsearcher: def __init__(self): self.__dimension = None self.__engine_perm = None self.__permutations = None def _set_confval(self, dimension=None): if dimension is None: return None else: self.__dimension = dimension def _engine_on(self): # Create permutations meta-hash self.__permutations = HashPermutations('permut') # Create binary hash as child hash rbp_perm = RandomBinaryProjections('rbp_perm', 14) rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100} # Add rbp as child hash of permutations hash self.__permutations.add_child_hash(rbp_perm, rbp_conf) # Create engine self.__engine_perm = Engine(self.__dimension, lshashes=[self.__permutations], distance=CosineDistance()) def conf(self, dimension): self._set_confval(dimension) self._engine_on() def getData(self, v): if self.__engine_perm is not None: self.__engine_perm.store_vector(v) def commitData(self): if self.__permutations is not None: self.__permutations.build_permuted_index() def find(self, v): if self.__engine_perm is not None: return self.__engine_perm.neighbours(v)
def main(argv): parser = argparse.ArgumentParser(prog='INDEX') parser.add_argument('source', help='path to the source metadata file') parser.add_argument('--hash-size', help='Hash size.', type=int, default=10) parser.add_argument('--num-tables', help='Number of tables.', type=int, default=5) parser.add_argument('--query-index', help='Index to use for query.', type=int, default=0) args = parser.parse_args(argv[1:]) # read in the data file data = pandas.read_csv(args.source, sep='\t') # Create a random binary hash with 10 bits rbp = RandomBinaryProjections('rbp', 10) # Create engine with pipeline configuration engine = Engine(len(data['features'][0].split(',')), lshashes=[rbp], distance=EuclideanDistance()) # indexing for i in range(0, len(data)): engine.store_vector( np.asarray(data['features'][i].split(',')).astype('float64'), data['filename'][i]) # query a vector q_vec response = engine.neighbours( np.asarray( data['features'][args.query_index].split(',')).astype('float64')) pprint(response)
class scLSH(object): def __init__(self, x): self.n, self.f = x.shape # Use NearPy lsh for fast ann rbp = RandomBinaryProjections('rbp', 10) self.engine = Engine(self.f, lshashes=[rbp]) for i in np.arange(self.n): v = x[i, :] self.engine.store_vector(v, i) def get_one_knn(self, v, k=3): vl = v.shape if vl[0] != self.f: print(vl) raise Exception("Data Not Match") N = self.engine.neighbours(v) nni = -np.ones(k, dtype='int') nnd = np.empty(k) nnd[:] = np.nan for i in np.arange(k): try: nni[i] = N[i][1] nnd[i] = N[i][2] except IndexError: break return (nni, nnd) def get_knn(self, x, k=3): self.n, self.f = x.shape nni = -np.ones((self.n, k), dtype='int') nnd = np.empty((self.n, k)) nnd[:] = np.nan for i in np.arange(self.n): i_i, i_d = self.get_one_knn(x[i, :]) nni[i, :] = i_i nnd[i, :] = i_d return (nni, nnd)
class GraphStateQueryIndex: def __init__(self): redis_object = redis.Redis(host='localhost', port=6379, db=0) redis_storage = RedisStorage(redis_object) # Get hash config from redis config = redis_storage.load_hash_configuration('MyHash') if config is None: # Config is not existing, create hash from scratch, with 5 projections self.lshash = RandomBinaryProjections('MyHash', 5) else: # Config is existing, create hash with None parameters self.lshash = RandomBinaryProjections(None, None) # Apply configuration loaded from redis self.lshash.apply_config(config) # print("HERE") # Create engine for feature space of 100 dimensions and use our hash. # This will set the dimension of the lshash only the first time, not when # using the configuration loaded from redis. Use redis storage to store # buckets. self.engine = Engine(4, lshashes=[self.lshash], storage=redis_storage) redis_storage.store_hash_configuration(self.lshash) def findMatch(self, v): matches = self.engine.neighbours(v) return matches def addVector(self, v, trainingText): self.engine.store_vector(v, trainingText) def clearIndex(self): self.engine.clean_all_buckets() def clearHashInstance(self, name): self.engine.clean_buckets(name)
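# Hypothetical usage of GraphStateQueryIndex above; assumes a Redis server is
# running on localhost:6379 so the hash configuration can be persisted.
import numpy as np

index = GraphStateQueryIndex()
index.addVector(np.array([1.0, 0.0, 1.0, 0.0]), 'state A')
for vector, data, distance in index.findMatch(np.array([1.0, 0.0, 1.0, 0.1])):
    print(data, distance)
index.clearIndex()  # wipe all buckets when finished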
def example2(): # Dimension of feature space DIM = 100 # Number of data points (dont do too much because of exact search) POINTS = 20000 ########################################################## print 'Performing indexing with HashPermutations...' t0 = time.time() # Create permutations meta-hash permutations = HashPermutations('permut') # Create binary hash as child hash rbp_perm = RandomBinaryProjections('rbp_perm', 14) rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100} # Add rbp as child hash of permutations hash permutations.add_child_hash(rbp_perm, rbp_conf) # Create engine engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance()) # First index some random vectors matrix = numpy.zeros((POINTS,DIM)) for i in xrange(POINTS): v = numpy.random.randn(DIM) matrix[i] = v engine_perm.store_vector(v) # Then update permuted index permutations.build_permuted_index() t1 = time.time() print 'Indexing took %f seconds' % (t1-t0) # Get random query vector query = numpy.random.randn(DIM) # Do random query on engine 3 print '\nNeighbour distances with HashPermutations:' print ' -> Candidate count is %d' % engine_perm.candidate_count(query) results = engine_perm.neighbours(query) dists = [x[2] for x in results] print dists # Real neighbours print '\nReal neighbour distances:' query = query.reshape((1,DIM)) dists = CosineDistance().distance_matrix(matrix,query) dists = dists.reshape((-1,)) dists = sorted(dists) print dists[:10] ########################################################## print '\nPerforming indexing with HashPermutationMapper...' t0 = time.time() # Create permutations meta-hash permutations2 = HashPermutationMapper('permut2') # Create binary hash as child hash rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14) # Add rbp as child hash of permutations hash permutations2.add_child_hash(rbp_perm2) # Create engine engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance()) # First index some random vectors matrix = numpy.zeros((POINTS,DIM)) for i in xrange(POINTS): v = numpy.random.randn(DIM) matrix[i] = v engine_perm2.store_vector(v) t1 = time.time() print 'Indexing took %f seconds' % (t1-t0) # Get random query vector query = numpy.random.randn(DIM) # Do random query on engine 4 print '\nNeighbour distances with HashPermutationMapper:' print ' -> Candidate count is %d' % engine_perm2.candidate_count(query) results = engine_perm2.neighbours(query) dists = [x[2] for x in results] print dists # Real neighbours print '\nReal neighbour distances:' query = query.reshape((1,DIM)) dists = CosineDistance().distance_matrix(matrix,query) dists = dists.reshape((-1,)) dists = sorted(dists) print dists[:10] ########################################################## print '\nPerforming indexing with mutliple binary hashes...' 
t0 = time.time() hashes = [] for k in range(20): hashes.append(RandomBinaryProjections('rbp_%d' % k, 10)) # Create engine engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance()) # First index some random vectors matrix = numpy.zeros((POINTS,DIM)) for i in xrange(POINTS): v = numpy.random.randn(DIM) matrix[i] = v engine_rbps.store_vector(v) t1 = time.time() print 'Indexing took %f seconds' % (t1-t0) # Get random query vector query = numpy.random.randn(DIM) # Do random query on engine 4 print '\nNeighbour distances with mutliple binary hashes:' print ' -> Candidate count is %d' % engine_rbps.candidate_count(query) results = engine_rbps.neighbours(query) dists = [x[2] for x in results] print dists # Real neighbours print '\nReal neighbour distances:' query = query.reshape((1,DIM)) dists = CosineDistance().distance_matrix(matrix,query) dists = dists.reshape((-1,)) dists = sorted(dists) print dists[:10]
class testing_suite: """ Class to test SDF files in a nearest neighbor lookup format, under different models of representation such as PCA, FactorAnalysis, KernelPCA with the rbf kernel, FastICA, and DictionaryLearning Sample Usage: test=testing_suite() test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver") num_train=12 num_test=4 test.make_train_test(num_train,num_test) accuracy,results=test.perform_PCA_tests() """ def __init__(self): self.PCA_changed_=True self.FA_changed_=True self.KPCA_changed_=True self.FICA_changed_=True self.DL_changed_=True self.all_files_=[] self.PCA_=None self.FA_ = None self.KPCA_ = None self.FICA_ = None self.DL_ = [] self.testing_=[] self.training_=[] self.engine_=[] self.training_vectors_=None self.confusion_={} self.biggest=0 def adddir(self,dir_to_add): """ add all sdf filepaths from a root directory tree (dir_to_add) to the all_files_ instance variable """ sdf_files = [] for root,dirs,files in walk(dir_to_add): for file_ in files: if file_.endswith("25.sdf"): sdf_files.append(path.join(root,file_)) self.all_files_+=sdf_files def adddir_25(self,dir_to_add): """add files in a directory only with dimension 12""" sdf_files = [] for root,dirs,files in walk(dir_to_add): for file_ in files: if file_.endswith(".sdf"): tempsdf=SDF(path.join(root,file_)) if tempsdf.dimensions()[0]==25*25*25: sdf_files.append(path.join(root,file_)) self.all_files_+=sdf_files def addfile(self,file_to_add): """add only one file to all_files""" self.all_files_.append(file_to_add) def make_train_test(self,num_train, num_test): """ populates the list of training files and testing files with filepaths based on a random number generator seeded with np.random.seed(100) Sample Usage: test=testing_suite() test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver") num_train=12 num_test=4 test.make_train_test(num_train,num_test) """ assert num_train+num_test<=len(self.all_files_) np.random.seed(100) permuted_indices = np.random.permutation(len(self.all_files_)) get_training = itemgetter(*permuted_indices[:num_train]) get_testing = itemgetter(*permuted_indices[num_train:num_train+num_test]) if num_train > 1: self.training_ = get_training(self.all_files_) else: self.training_= [get_training(self.all_files_)] if num_test > 1: self.testing_ = get_testing(self.all_files_) else: self.testing_ = [get_testing(self.all_files_)] def normalize_vector(self,vector,largest_dimension): """normalizes smaller sdf vectors to a larger size by vertical stacking a column of zeros underneath""" return np.vstack((vector,np.zeros((largest_dimension-vector.shape[0],1)))) def get_PCA_training_vectors(self): """ gets all training_vectors from the set of training files, normalizes them using normalize vector and adds them all to a numpy array that gets returned """ training_sdf=[SDF(i) for i in list(self.training_)] self.biggest=0 for item in training_sdf: self.biggest=max(self.biggest,item.dimensions()[0]) return_train_vectors=None for tempsdf in training_sdf: vectorized=np.reshape(tempsdf.data(),(tempsdf.dimensions()[0],1)) normal_vector=self.normalize_vector(vectorized,self.biggest) if return_train_vectors==None: return_train_vectors=normal_vector else: return_train_vectors=np.concatenate((return_train_vectors,normal_vector),axis=1) return return_train_vectors """ -any function begining with make creates the sklearn.decomposition framework for the specified decomposition type -any function begining with fit fits the training vectors to the decomposition framework -any function 
begining with transform transforms the training vectors based on the fitted decomposition framework """ def render_sdf(self, a, thresh = 1e-3): h = plt.figure() ax = h.add_subplot(111, projection = '3d') surface_points = np.where(np.abs(a) < thresh) x = surface_points[0] y = surface_points[1] z = surface_points[2] ax.scatter(x, y, z) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_xlim3d(0,a.shape[0]) ax.set_ylim3d(0,a.shape[1]) ax.set_zlim3d(0,a.shape[2]) plt.show() def make_PCA(self): self.PCA_=skdec.PCA()#n_components='mle') def fit_PCA(self,training_vectors): self.PCA_.fit(training_vectors) def make_FA(self): self.FA_=skdec.FactorAnalysis(n_components=len(list(self.training_))) def fit_FA(self,training_vectors): self.FA_.fit(training_vectors) def make_KPCA(self,kernel_option="rbf"): self.KPCA_=skdec.KernelPCA(gamma=0.1, kernel=kernel_option) def fit_KPCA(self,training_vectors): self.KPCA_.fit(training_vectors) def make_FICA(self): self.FICA_=skdec.FastICA(n_components=len(list(self.training_))) def fit_FICA(self,training_vectors): self.FICA_.fit(training_vectors) def make_DL(self,alpha_values): self.DL_.append(skdec.DictionaryLearning(n_components=len(list(self.training_)),alpha= alpha_values,transform_algorithm = 'omp')) def fit_DL(self,training_vectors): self.DL_[-1].fit(training_vectors) def load_PCA(self,vector_set): """reinitializes our engine and loads a numpy set of vectors of dimension (self.biggest,1) into self.engine_""" rbp = RandomBinaryProjections('rbp', 10) self.engine_ = Engine(self.PCA_.components_.shape[1], lshashes=[rbp]) transformed_vectors = self.PCA_.transform(vector_set.T) for i in range(len(list(self.training_))): #vector=vector_set[:,i] #vector=np.reshape(vector,(self.biggest,1)) #vector=self.PCA_.transform(vector) self.engine_.store_vector(transformed_vectors[i,:], self.training_[i]) def load_FA(self,vector_set): rbp = RandomBinaryProjections('rbp',10) self.engine_ = Engine(self.biggest, lshashes=[rbp]) for i in range(len(list(self.training_))): vector=vector_set[:,i] vector=np.reshape(vector,(self.biggest,1)) vector=self.FA_.transform(vector) self.engine_.store_vector(vector[:,0],self.training_[i]) def load_KPCA(self,vector_set): rbp = RandomBinaryProjections('rbp',10) self.engine_ = Engine(self.KPCA_.alphas_.shape[1], lshashes=[rbp]) transformed_vectors = self.KPCA_.transform(vector_set.T) for i in range(len(list(self.training_))): #vector=vector_set[:,i] #vector=np.reshape(vector,(self.biggest,1)) #vector=self.KPCA_.transform(vector) self.engine_.store_vector(transformed_vectors[i,:], self.training_[i]) def load_FICA(self,vector_set): rbp = RandomBinaryProjections('rbp',10) self.engine_ = Engine(self.biggest, lshashes=[rbp]) for i in range(len(list(self.training_))): vector=vector_set[:,i] vector=np.reshape(vector,(self.biggest,1)) vector=self.FICA_.transform(vector) self.engine_.store_vector(vector[:,0],self.training_[i]) def load_DL(self,vector_set): rbp = RandomBinaryProjections('rbp',10) self.engine_ = Engine(self.biggest, lshashes=[rbp]) for i in range(len(list(self.training_))): vector=vector_set[:,i] vector=np.reshape(vector,(self.biggest,1)) vector=self.DL_[-1].transform(vector) self.engine_.store_vector(vector[:,0],self.training_[i]) def engine_query(self,test_vector): """ queries the engine with a (self.biggest,1) dimension vector and returns the file_names of nearest neighbors and the results """ #print test_vector #reshaped=np.reshape(test_vector,(self.biggest,1)) results = self.engine_.neighbours(test_vector.T) file_names = [i[1] 
for i in results] return file_names, results def setup_confusion(self): """ reinitializes the self.confusion_ confusion matrix variable """ self.confusion_={} self.confusion_[UNKNOWN_TAG] = {} for file_ in self.all_files_: category = cat50_file_category(file_) self.confusion_[category] = {} for query_cat in self.confusion_.keys(): for pred_cat in self.confusion_.keys(): self.confusion_[query_cat][pred_cat] = 0 """ Makes a test vector by taking in an SDF, reshaping it, normalizing it, then returns a transformed version of that vector based on the corresponding decomposition model that was already trained """ def make_test_vector(self,sdf_array,vector_type): if vector_type=="PCA": return self.make_PCA_test_vector(sdf_array) elif vector_type=="FA": return self.make_FA_test_vector(sdf_array) elif vector_type=="KPCA": return self.make_KPCA_test_vector(sdf_array) elif vector_type=="FICA": return self.make_FICA_test_vector(sdf_array) elif vector_type=="DL": return self.make_DL_test_vector(sdf_array) def make_DL_test_vector(self,sdf_array): reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1)) normalized=self.normalize_vector(reshaped,self.biggest) return self.DL_[-1].transform(normalized)[:,0] def make_FICA_test_vector(self,sdf_array): reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1)) normalized=self.normalize_vector(reshaped,self.biggest) return self.FICA_.transform(normalized)[:,0] def make_KPCA_test_vector(self,sdf_array): reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1)) return self.KPCA_.transform(reshaped.T) # reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1)) # normalized=self.normalize_vector(reshaped,self.biggest) # return self.KPCA_.transform(normalized)[:,0] def make_FA_test_vector(self,sdf_array): reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1)) normalized=self.normalize_vector(reshaped,self.biggest) return self.FA_.transform(normalized)[:,0] def make_PCA_test_vector(self,sdf_array): reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1)) return self.PCA_.transform(reshaped.T) # IPython.embed() # normalized=self.normalize_vector(reshaped,self.biggest) # return self.PCA_.transform(normalized)[:,0] """ querys the loaded and trained engine with each of your test vectors from make_train_test Returns accuracy: float representing the accuracy of querying the nearpy engine with the test results test_results: dictionary of the results from the "testing" for each of the sdf_files """ def perform_tests(self,K,test_type): test_results={} for file_ in list(self.testing_): query_category=cat50_file_category(file_) print "Querying: %s with category %s "%(file_, query_category) converted = SDF(file_) test_vector=self.make_test_vector(converted,test_type) closest_names, closest_vals=self.engine_query(test_vector.T[:,0]) pred_category=UNKNOWN_TAG if len(closest_names)>0: closest_category=closest_names[0] pred_category=cat50_file_category(closest_category) for i in range(1,min(K,len(closest_names))): closest_category = closest_names[i] potential_category = cat50_file_category(closest_category) if potential_category == query_category: pred_category = potential_category print "Result Category: %s"%(pred_category) self.confusion_[query_category][pred_category] += 1 test_results[file_]= [(closest_names, closest_vals)] row_names=self.confusion_.keys() confusion_mat=np.zeros([len(row_names),len(row_names)]) i=0 for query_cat in self.confusion_.keys(): j = 0 for pred_cat in self.confusion_.keys(): 
confusion_mat[i,j] = self.confusion_[query_cat][pred_cat] j += 1 i += 1 # get true positives, etc for each category num_preds = len(self.testing_) tp = np.diag(confusion_mat) fp = np.sum(confusion_mat, axis=0) - np.diag(confusion_mat) fn = np.sum(confusion_mat, axis=1) - np.diag(confusion_mat) tn = num_preds * np.ones(tp.shape) - tp - fp - fn # compute useful statistics recall = tp / (tp + fn) tnr = tn / (fp + tn) precision = tp / (tp + fp) npv = tn / (tn + fn) fpr = fp / (fp + tn) accuracy = np.sum(tp) / num_preds # correct predictions over entire dataset # remove nans recall[np.isnan(recall)] = 0 tnr[np.isnan(tnr)] = 0 precision[np.isnan(precision)] = 0 npv[np.isnan(npv)] = 0 fpr[np.isnan(fpr)] = 0 return accuracy, test_results, recall, tnr, precision,npv,fpr def vis_pca_components(self, num_comp_vis, thresh = 0.01, method = 'PCA'): PCA = self.PCA_ if method == 'KPCA': PCA = self.KPCA_ num_components = PCA.components_.shape[0] num_components = min(num_comp_vis, num_components) comp_per_dim = int(math.ceil(math.sqrt(num_components))) h = plt.figure() for i in range(num_components): ax = h.add_subplot(comp_per_dim, comp_per_dim, i+1, projection = '3d') components = PCA.components_[i,:] comp_grid = components.reshape(25, 25, 25) surface_points = np.where(np.abs(comp_grid) < thresh) x = surface_points[0] y = surface_points[1] z = surface_points[2] ax.scatter(x, y, z) ax.set_xlabel('X') ax.set_ylabel('Y') ax.set_zlabel('Z') ax.set_xlim3d(0,25) ax.set_ylim3d(0,25) ax.set_zlim3d(0,25) ax.set_title('Component %d'%(i)) plt.show() def vis_pca_component_slices(self, num_comp_vis, method = 'PCA'): PCA = self.PCA_ if method == 'KPCA': PCA = self.KPCA_ num_components = PCA.components_.shape[0] num_components = min(num_comp_vis, num_components) comp_per_dim = int(math.ceil(math.sqrt(num_components))) plt.figure() for i in range(num_components): plt.subplot(comp_per_dim, comp_per_dim, i+1) components = PCA.components_[i,:] comp_grid = components.reshape(25, 25, 25) comp_slice = comp_grid[:,:,12] plt.imshow(comp_slice) plt.title('Component %d XY Plane'%(i)) plt.figure() for i in range(num_components): plt.subplot(comp_per_dim, comp_per_dim, i+1) components = PCA.components_[i,:] comp_grid = components.reshape(25, 25, 25) comp_slice = comp_grid[:,12,:] plt.imshow(comp_slice) plt.title('Component %d XZ Plane'%(i)) plt.figure() for i in range(num_components): plt.subplot(comp_per_dim, comp_per_dim, i+1) components = PCA.components_[i,:] comp_grid = components.reshape(25, 25, 25) comp_slice = comp_grid[12,:,:] plt.imshow(comp_slice) plt.title('Component %d YZ Plane'%(i)) plt.show() """ runs perform_tests on a specific type of decomposition after creating that decomposition type framework with the training vectors and loading those training vectors into the engine K is the number of neighbors to check """ def perform_PCA_tests(self,K): train_vectors=self.get_PCA_training_vectors() self.make_PCA() print 'Fitting PCA' self.fit_PCA(train_vectors.T) print 'Loading PCA' self.load_PCA(train_vectors) print 'Setup confusion' self.setup_confusion() print 'Eval accuracy' #IPython.embed() accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"PCA") return accuracy,test_results, recall, tnr, precision,npv,fpr def perform_FA_tests(self,K): train_vectors=self.get_PCA_training_vectors() self.make_FA() self.fit_FA(train_vectors) self.load_FA(train_vectors) self.setup_confusion() accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"FA") return accuracy,test_results, recall, tnr, 
precision,npv,fpr def perform_KPCA_tests(self,K,kernel="rbf"): train_vectors=self.get_PCA_training_vectors() self.make_KPCA(kernel_option=kernel) print 'Fitting KCPA' self.fit_KPCA(train_vectors.T) print 'Loading KPCA' self.load_KPCA(train_vectors) self.setup_confusion() accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"KPCA") IPython.embed() return accuracy,test_results, recall, tnr, precision,npv,fpr def perform_FICA_tests(self,K): train_vectors=self.get_PCA_training_vectors() self.make_FICA() self.fit_FICA(train_vectors) self.load_FICA(train_vectors) self.setup_confusion() accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"FICA") return accuracy,test_results, recall, tnr, precision,npv,fpr def perform_DL_tests(self,K,alpha): train_vectors=self.get_PCA_training_vectors() self.make_DL(alpha_values=alpha) self.fit_DL(train_vectors) self.load_DL(train_vectors) self.setup_confusion() accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"DL") return accuracy,test_results, recall, tnr, precision,npv,fpr def get_engine(self): return self.engine_ def get_PCA(self): return self.PCA_ def get_FA(self): return self.FA_ def get_KPCA(self): return self.KPCA_ def get_FICA(self): return self.FICA_ def get_DL(self): return self.DL_ def get_explained_variance_ratio(self): return self.PCA_.explained_variance_ratio_
class LSHSearch: def __init__(self, feature_file, dimension, neighbour, lsh_project_num): self.feature_file = feature_file self.dimension = dimension self.neighbour = neighbour self.face_feature = defaultdict(str) self.ground_truth = defaultdict(int) # Create permutations meta-hash permutations2 = HashPermutationMapper('permut2') tmp_feature = defaultdict(str) with open(feature_file, 'rb') as f: reader = csv.reader(f, delimiter=' ') for name, feature in reader: tmp_feature[name] = feature matrix = [] label = [] for item in tmp_feature.keys(): v = map(float, tmp_feature[item].split(',')) matrix.append(np.array(v)) label.append(item) random.shuffle(matrix) print 'PCA matric : ', len(matrix) rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix) permutations2.add_child_hash(rbp_perm2) # Create engine nearest = NearestFilter(self.neighbour) self.engine = Engine(self.dimension, lshashes=[permutations2], distance=CosineDistance(), vector_filters=[nearest]) def build(self): with open(self.feature_file, 'rb') as f: reader = csv.reader(f, delimiter=' ') for name, feature in reader: self.face_feature[name] = feature person = '_'.join(name.split('_')[:-1]) self.ground_truth[person] += 1 for item in self.face_feature.keys(): v = map(float, self.face_feature[item].split(',')) self.engine.store_vector(v, item) def query(self, person_list): dists = [] scores = [] for person in person_list: query = map(float, self.face_feature[person].split(',')) print '\nNeighbour distances with mutliple binary hashes:' print ' -> Candidate count is %d' % self.engine.candidate_count(query) results = self.engine.neighbours(query) dists = dists + [x[1] for x in results] scores = scores + [x[2] for x in results] t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists] res = zip(dists, scores, t_num) res.sort(key = lambda t: t[1]) res1 = self.f7(res, person_list) return res1[:self.neighbour] def true_num(self, person): return self.ground_truth[person] def f7(self, zip_seq, person_list): seen = set() seen_add = seen.add return [ x for x in zip_seq if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
class LSHSearch: def __init__(self, feature_file, dimension, neighbour, lsh_project_num): self.feature_file = feature_file self.dimension = dimension self.neighbour = neighbour self.face_feature = defaultdict(str) self.ground_truth = defaultdict(int) # Create permutations meta-hash self.permutations2 = HashPermutationMapper('permut2') tmp_feature = defaultdict(str) with open(feature_file, 'rb') as f: reader = csv.reader(f, delimiter=' ') for name, feature in reader: tmp_feature[name] = feature matrix = [] label = [] for item in tmp_feature.keys(): v = map(float, tmp_feature[item].split(',')) matrix.append(np.array(v)) label.append(item) random.shuffle(matrix) print 'PCA matric : ', len(matrix) rbp_perm2 = PCABinaryProjections( 'testPCABPHash', lsh_project_num, matrix) self.permutations2.add_child_hash(rbp_perm2) # Create engine nearest = NearestFilter(self.neighbour) self.engine = Engine( self.dimension, lshashes=[self.permutations2], distance=CosineDistance(), vector_filters=[nearest]) def build(self): with open(self.feature_file, 'rb') as f: reader = csv.reader(f, delimiter=' ') for name, feature in reader: self.face_feature[name] = feature person = '_'.join(name.split('_')[:-1]) self.ground_truth[person] += 1 for item in self.face_feature.keys(): v = map(float, self.face_feature[item].split(',')) self.engine.store_vector(v, item) def update(self, person, feature): print feature v = map(float, feature.split(',')) epoch_time = long(time.time()) f_name = person + '_' + str(epoch_time) print f_name self.engine.store_vector(v, f_name) def query(self, person_feature): dists = [] scores = [] query = map(float, person_feature.split(',')) # print '\nNeighbour distances with mutliple binary hashes:' # print ' -> Candidate count is %d' % self.engine.candidate_count(query) results = self.engine.neighbours(query) dists = dists + [x[1] for x in results] scores = scores + [x[2] for x in results] res = zip(dists, scores) res.sort(key=lambda t: t[1]) return res[:self.neighbour]
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.filters import NearestFilter
# `Representation` comes from the surrounding RL framework, and `distance` is
# presumably scipy's Chebyshev distance (see the comment in d() below), e.g.:
# from scipy.spatial.distance import chebyshev as distance


class RMAX_repr(Representation):
    """
    Identical to the Tabular representation (i.e. assigns a binary feature
    function f_d() to each possible discrete state *d* in the domain, with
    f_d(s) = 1 when d == s and 0 elsewhere). HOWEVER, unlike *Tabular*,
    feature functions are only created for states *s* that have actually been
    encountered in the domain, rather than being instantiated for every single
    state at the outset.
    """

    def __init__(self, domain, Rmax, LQ, k=1, epsilon_d=0.01):
        # LQ is the Lipschitz constant - 10**3 according to the paper (by cross-validation)
        self.LQ = LQ
        self.gamma = domain.discount_factor
        self.rmax = Rmax
        self.qmax = Rmax / (1 - self.gamma)
        self.qmax_tilda = Rmax + self.gamma * self.qmax
        self.epsilon = epsilon_d
        # Approximate k-NN is used when finding the Q value of a point
        self.k = k
        # We also keep track of the states sampled so far
        self.sample_list = [0] * (2 * 100000)
        self.list_idx = 0
        # And a dictionary for quick lookups of already computed values
        self.sample_values = {}
        # And we use an LSH to find the approximate k-nearest neighbours
        # by training it on every (s, a, r, s') tuple we see
        self.init_randomization()
        super(RMAX_repr, self).__init__(domain)

    def init_randomization(self):
        rbp = RandomBinaryProjections('rbp', 10)
        from nearpy.distances import ChebyshevDistance
        self.engine = Engine(7, lshashes=[rbp],
                             vector_filters=[NearestFilter(self.k)],
                             distance=ChebyshevDistance())

    def is_known(self, s, a):
        # A (s, a) pair is 'known' if LQ * d(s, a, s', a') < epsilon_d
        indices = self.approx_nn(s, a)
        if not indices:
            return False
        for idx in indices:
            s_p, a_p = self.sample_list[idx]
            if self.LQ * self.d(s, a, s_p, a_p) > self.epsilon:
                return False
        return True

    def pre_discover(self, s, p_terminal, a, r, ns, terminal):
        # In the learning stage, if (s, a) is not 'known', add it to the sample
        # list and its value to sample_values.
        if not self.is_known(s, a):
            x = r + self.gamma * max(self.Q_tilda(ns, a_p)
                                     for a_p in range(self.actions_num))
            self.engine.store_vector(np.append(s, a), self.list_idx)
            self.sample_list[self.list_idx] = (s, a)
            self.list_idx += 1
            self.sample_values[self.sa_tuple(s, a)] = x
            # self.LSH.partial_fit(np.append(s, a))
        super(RMAX_repr, self).pre_discover(s, p_terminal, a, ns, terminal)

    def d(self, s, a, ns, na):
        # Compute a distance metric between (s, a) and (ns, na).
        # Using the max-norm as in the paper for now.
        sa = np.append(s, a)
        nsa = np.append(ns, na)
        # Use scipy to compute the Chebyshev distance => max norm
        return distance(sa, nsa)

    def approx_nn(self, s, a):
        # dist, indices = self.LSH.kneighbors(np.append(s, a))
        # neighbours() returns a list of (vector, data, distance) tuples
        l = self.engine.neighbours(np.append(s, a))
        indices = [elem[1] for elem in l]
        return indices

    def sa_tuple(self, s, a):
        return tuple(np.append(s, a))

    def Q_tilda(self, s, a):
        # The approximate Q function
        k = self.k
        q = 0.0
        # First get the k nearest sampled neighbours to this point using LSH
        indices = self.approx_nn(s, a)
        num_neighbors = 0
        for index in indices:
            sj, aj = self.sample_list[index]
            dij = self.d(s, a, sj, aj)
            if dij <= (self.qmax / self.LQ):
                xj = self.sample_values[self.sa_tuple(sj, aj)]
                q += dij * self.LQ + xj
                num_neighbors += 1
        # In case there were fewer than k neighbours, use qmax_tilda for the rest
        for i in range(num_neighbors, k):
            q += self.qmax_tilda
        # Return the average Q
        return q / k

    def Qs(self, s, terminal, phi_s=None):
        # Q -> array of Q(s, a) values for this state
        # A -> corresponding IDs
        # Before any learning is done, the experiment calls the policy to
        # estimate prior performance. In that case, the LSHF would throw a
        # ValueError. We pre-empt that here.
        Q = np.zeros((self.actions_num))
        # try:
        #     self.LSH.kneighbors(np.append(s, 0))
        # except ValueError:
        #     return Q
        for a in range(self.actions_num):
            Q[a] = self.Q_tilda(s, a)
        return Q
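The d() method above computes a max-norm over the concatenated state-action vectors. The sketch below illustrates that metric using scipy's Chebyshev distance, which is presumably what the `distance` callable in d() refers to (an assumption based on the comment there); the example vectors are invented.

# Max-norm (Chebyshev) distance between two hypothetical (state, action) vectors
import numpy as np
from scipy.spatial.distance import chebyshev

sa = np.array([0.2, -1.0, 3.0, 1.0])
nsa = np.array([0.2, -1.5, 2.0, 1.0])
print(chebyshev(sa, nsa))  # 1.0 -> the largest coordinate-wise absolute difference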
import unittest

import numpy
from redis import Redis

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjectionTree
from nearpy.filters import NearestFilter
from nearpy.storage import MemoryStorage, RedisStorage


class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost', port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)
        # Create engine for 100-dimensional feature space; do not forget to set
        # the nearest filter to 20, because the default is 10
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])
        # First insert 200000 random vectors
        # print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
        # Now do random queries and check result set size
        # print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            # print "Candidate count = %d" % self.engine.candidate_count(x)
            # print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)
        # Create engine for 100-dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])
        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check that both hashes produce the same keys
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for idx in range(len(keys1)):
                self.assertEqual(keys1[idx], keys2[idx])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)
        # Create engine for 100-dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])
        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check that both hashes produce the same keys
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for idx in range(len(keys1)):
                self.assertEqual(keys1[idx], keys2[idx])
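The loop that follows is only the indexing-and-query section of a larger script: it presupposes an open file handle f, a running counter i, a dimension, and an engine. A minimal sketch of that missing setup is given below; the file name 'vectors.txt', the dimensionality, and the hash size are invented for illustration only.

# Hypothetical setup for the loop that follows
import numpy
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

dimension = 128                            # length of each stored vector (assumed)
rbp = RandomBinaryProjections('rbp', 10)   # 10-bit random binary hash (assumed size)
engine = Engine(dimension, lshashes=[rbp])

f = open('vectors.txt')                    # one "<id> <v1> <v2> ..." line per vector
i = 0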
for next_read_line in f:
    next_read_line = next_read_line.rstrip()
    split_arr = next_read_line.split(" ")
    split_arr = split_arr[1:]
    split_arr = list(map(float, split_arr))
    vector = numpy.asarray(split_arr)
    if i == 639:
        query = vector
        # print(query)
    else:
        vec_data = numpy.append(vector, i)
        engine.store_vector(vector, tuple(vec_data))
    i += 1

# Get nearest neighbours:
N = engine.neighbours(query)

# Number of nearest neighbours:
print(len(N))

print("Nearest Neighbours")
for x in N:
    # Printing the id of the vector here as needed:
    # print(x[1][dimension])
    print(x[1][dimension])
import numpy

from nearpy import Engine
from nearpy.hashes import (RandomBinaryProjections, RandomBinaryProjectionTree,
                           HashPermutations, HashPermutationMapper)
from nearpy.distances import CosineDistance


def example1():
    # Dimension of feature space
    DIM = 100

    # Number of data points (don't use too many because of the exact search at the end)
    POINTS = 10000

    print 'Creating engines'

    # We want 20 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print 'Indexing %d random vectors of dimension %d' % (POINTS, DIM)

    # First index some random vectors
    matrix = numpy.zeros((POINTS, DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print 'Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys())
    print 'Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys())

    print 'Building permuted index for HashPermutations'

    # Then update permuted index
    permutations.build_permuted_index()

    print 'Generate random query vector'

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print '\nNeighbour distances with RandomBinaryProjectionTree:'
    print ' -> Candidate count is %d' % engine_rbpt.candidate_count(query)
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 2
    print '\nNeighbour distances with RandomBinaryProjections:'
    print ' -> Candidate count is %d' % engine.candidate_count(query)
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print ' -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutationMapper:'
    print ' -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1, DIM))
    dists = CosineDistance().distance_matrix(matrix, query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
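To run the walkthrough above as a stand-alone script, the usual entry-point guard suffices (assuming example1 sits at module level next to the imports above):

if __name__ == '__main__':
    example1()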