Example #1
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
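For quick reference, here is a minimal sketch of the Engine round trip these tests exercise; the dimension, the random vector, and the attached label are purely illustrative, and the import paths assume a standard NearPy installation:

import numpy
from nearpy import Engine

engine = Engine(100)                               # engine over a 100-dimensional vector space
v = numpy.random.randn(100)
engine.store_vector(v, 'some data')                # attach an arbitrary label to the vector
vector, data, distance = engine.neighbours(v)[0]   # results are (vector, data, distance) tuples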
Example #2
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #3
class NearPy(NearestNeighbor):
    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)

        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]

        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()
#        logging.info('Took %f sec' %(train_end - train_start))

    def within_distance(self, x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)

        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)

        if len(query_result) == 0:
            return [], []

        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
Example #4
class TestPermutation(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print '\nNeighbour distances with permuted index:'
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Do random query on engine without permutations meta-hash
        print '\nNeighbour distances without permuted index (distances should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Real neighbours
        print '\nReal neighbour distances:'
        query = query.reshape((1, 200))
        dists = CosineDistance().distance_matrix(matrix, query)
        dists = dists.reshape((-1, ))
        dists = sorted(dists)
        print dists[:10]
Example #5
class TestPermutation(unittest.TestCase):

    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000,200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print '\nNeighbour distances with permuted index:'
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Do random query on engine without permutations meta-hash
        print '\nNeighbour distances without permuted index (distances should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Real neighbours
        print '\nReal neighbour distances:'
        query = query.reshape((1,200))
        dists = CosineDistance().distance_matrix(matrix,query)
        dists = dists.reshape((-1,))
        dists = sorted(dists)
        print dists[:10]
Example #6
class TestPermutation(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        permuted_dists = [x[2] for x in results]

        # Do random query on engine without permutations meta-hash (distances
        # should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]

        self.assertLess(permuted_dists[0], dists[0])
Example #7
class RandomBinaryNN(NearestNeighbor):
    """ Nearest neighbor implementation by using random binary trees from nearpy package """
    def __init__(self, dimension: int, number_projections: int,
                 threshold: float):
        """
        :param dimension:
            Number of dimensions of input points
        :param number_projections:
            Number of random projections used for finding nearest neighbors.
            Trade-off: more projections result in fewer false positives in the candidate set.
        :param threshold:
            Distance threshold defining "nearest": all points within this distance count as neighbours.
        """
        self.rbp = RandomBinaryProjections('rbp', number_projections)
        self.sqdist = SquaredEuclideanDistance()
        self.ann_engine = Engine(
            dimension,
            lshashes=[self.rbp],
            distance=self.sqdist,
            vector_filters=[DistanceThresholdFilter(threshold)])

    def insert_candidate(self, point: np.ndarray, metadata):
        self.ann_engine.store_vector(point, data=metadata)

    def get_candidates(self, point: np.ndarray):
        return [
            NearestNeighborResult(res[0], res[1], res[2])
            for res in self.ann_engine.neighbours(point)
        ]
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    #perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
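A short usage sketch for the RandomBinaryNN class above; the dimensionality, projection count, threshold, and labels are invented, and numpy is assumed to be imported as np (as in the class itself):

# Hypothetical values for illustration only.
index = RandomBinaryNN(dimension=128, number_projections=10, threshold=0.5)
for label in range(100):
    index.insert_candidate(np.random.randn(128), metadata=label)
# Returns NearestNeighborResult objects for the stored points within the threshold.
candidates = index.get_candidates(np.random.randn(128))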
Example #9
def knn(data, k):
    assert k <= len(data) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension, lshashes=[rbp], vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):
        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
Example #10
class LSH:
    def __init__(self, path, dataSize):
        self.path = path
        self.dataSize = dataSize

    def preprocess(self):
        ids = []
        meta = []
        data = []

        for i in range(self.dataSize):
            with open(self.path + str(i) + ".data", "rb") as file:
                f_song_id = pickle.load(file)
                f_songMeta = pickle.load(file)
                f_data = pickle.load(file)
                ids.append(f_song_id)
                meta.append(f_songMeta)
                data.append(f_data)

        self.id = np.array(ids)
        self.meta = np.array(meta)
        self.data = np.array(data)

    def generate_hashtable(self):
        self.engine = Engine(self.data.shape[1],
                             lshashes=[RandomBinaryProjections('rbp', 20)])

        for i in range(self.dataSize):
            self.engine.store_vector(self.data[i], data=self.id[i])

    def query(self, data):
        return self.engine.neighbours(data)
Example #11
class StateDBEngine(object):
    def __init__(self):
        # initialize "nearby" library
        self.dim = 4
        self.rbp = RandomBinaryProjections('rbp', 100)
        self.engine = Engine(self.dim, lshashes=[self.rbp])
        # performance counter
        self.counter = 0

    def add(self, x, data):
        # print 'add data = ', data
        self.engine.store_vector(x, data)
        self.counter += 1

    def lookup(self, x, THRESHOLD=0.1):
        naver = self.engine.neighbours(x)
        if len(naver) == 0:
            return None

        pt, data, d = naver[0]
        # print 'lhs, rhs', x, pt,
        # print 'd = ', d, (d < THRESHOLD), (data is None)
        if d < THRESHOLD:
            return data
        else:
            return None
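A brief sketch of how this state cache might be exercised; the 4-dimensional vectors are made up and numpy is assumed to be imported as np:

db = StateDBEngine()
db.add(np.array([0.0, 1.0, 2.0, 3.0]), data='state-42')
hit = db.lookup(np.array([0.0, 1.0, 2.0, 3.0]))    # 'state-42' (distance below THRESHOLD)
miss = db.lookup(np.array([9.0, 9.0, 9.0, 9.0]))   # likely None: different bucket or too far away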
Example #12
class PointCalculator():
    def __init__(self, point_list, point):
        self.__configure_calculator(point_list, point)

    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2

        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)

    def __load_point_list_in_engine(self):
        for index in xrange(0, len(self.__point_list__)):
            v = numpy.array(self.__point_list__[index])
            self.__engine__.store_vector(v, 'data_%d' % index)

    def set_searching_point_list(self, point_list):
        self.__point_list__ = point_list
        self.__load_point_list_in_engine()

    def set_query_point(self, point):
        self.__point__ = point

    def __get_nearest_point(self):
        return self.__engine__.neighbours(numpy.array(self.__point__))

    def get_nearest_point_array_coords(self):
        nearest_point = self.__get_nearest_point()
        return [nearest_point[0][0][0], nearest_point[0][0][1]]
Example #13
class ImageSimilarity():
    def __init__(self, distanceMeasure="EuclideanDistance"):
        self.res_similar = ResnetSimilarity()
        dimension = 2048
        rbp = RandomBinaryProjections('rbp', 10)
        self.engine = Engine(dimension, lshashes=[rbp])
        if distanceMeasure == "EuclideanDistance":
            self.filehandler = open("hashed_objects/hashed_object_euclidean.pkl", 'rb')
        elif distanceMeasure == "Test":
            self.filehandler = open("hashed_objects/hashed_object_example.pkl", 'rb')
        else:
            self.filehandler = open("hashed_objects/hashed_object_Cosine.pkl", 'rb')
        self.engine = pickle.load(self.filehandler)
        self.filehandler.close()
        print("Hash Table Loaded")

    def query(self, image):
        result = []
        image_emb = self.res_similar.getMapping(image)
        image_emb = image_emb.view(-1, 2048)
        image_emb = image_emb.numpy()

        N = self.engine.neighbours(image_emb[0])
        for i in range(len(N)):
            result.append(N[i][1])
            if i == 5:
                break

        return result

    def tearDown(self):
        self.filehandler.close()
Example #14
def knn(data, k):
    assert k <= len(
        data
    ) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if (dimension < 10):
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension,
                    lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):

        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
Example #15
        def RunAnnNearpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            train, label = SplitTrainData(self.dataset)

            with totalTimer:
                # Get all the parameters.
                try:
                    # Perform Approximate Nearest-Neighbors
                    dimension = train.shape[1]
                    rbp = RandomBinaryProjections('rbp', 10)
                    engine = Engine(dimension, lshashes=[rbp])
                    for i in range(len(train)):
                        engine.store_vector(train[i], 'data_%d' % i)
                    for i in range(len(queryData)):
                        v = engine.neighbours(queryData[i])
                except Exception as e:
                    Log.Info(e)
                    q.put(e)
                    return -1
            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #16
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertEqual(len(engine2.storage.buckets), 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(),
                                   0,
                                   delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(),
                                   0,
                                   delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
Example #17
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the ten closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, data.nbae))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in
                    np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #18
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #19
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance  = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
Example #20
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows = decade_matrix.get_shape()[0]
    print("the number of rows:" + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    engine = Engine(num_rows, lshashes=[rbp])
    for i in range(num_rows):
        print(i)

        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
Example #21
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets) == 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #22
class CFiltering:
    def __init__(self,
                 matrix,
                 max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()],
                 distance=Pearson()):

        if not isinstance(lshashes, list):
            raise TypeError("'lshashes' must be an instance of 'list'")

        if not isinstance(vector_filters, list):
            raise TypeError("'vector_filters' must be an instance of 'list'")

        self.underlying = Engine(len(matrix[0]),
                                 lshashes=lshashes,
                                 vector_filters=vector_filters +
                                 [NearestFilter(max_neighbours)],
                                 distance=distance)

        for vector in matrix:
            self.underlying.store_vector(vector)

    def predict(self, vector, precision):
        neighbours = self.underlying.neighbours(vector)

        if not neighbours:
            raise ValueError("Failed to acquire any neighbours")

        average = [
            sum(neighbour) / len(neighbour) for neighbour, _, _ in neighbours
        ]

        avg = sum(vector) / len(vector)

        for i in range(len(vector)):
            if vector[i] < precision:
                weighted_sum = 0

                for j, neighbour in enumerate(neighbours):
                    neighbour, _, similarity = neighbour
                    weighted_sum += similarity * (neighbour[j] - average[j])

                vector[i] = avg + weighted_sum / len(vector)

        return vector
Example #23
class Neighbors:
    """ Nearest neighbors. """
    def __init__(self, config, verbose=True, log_file=None):
        # set up logger
        self._logger = Logger.get_logger(self.__class__.__name__,
                                         log_file=log_file,
                                         silence=(not verbose),
                                         global_log_file=verbose)

        # read config
        self._parse_config(config)

        self._engine = None

    def _parse_config(self, config):
        self._num_neighbors = config["num_neighbors"]

    def _build_engine(self, dimension):
        # build NearPy engine
        self._logger.info("Building engine...")
        self._engine = Engine(
            dimension, vector_filters=[NearestFilter(self._num_neighbors)])

    def store(self, vectors, data=None, log_freq=10, verbose=True):
        self._logger.info("Storing vectors...")
        if data is not None:
            assert vectors.shape[0] == len(
                data), "Dim 0 of vectors and data must match!"

        if self._engine is None:
            self._build_engine(vectors.shape[-1])

        num_vectors = vectors.shape[0]
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Storing vector {} of {}...".format(
                    idx, num_vectors))
            if data is not None:
                self._engine.store_vector(vectors[idx], data[idx])
            else:
                self._engine.store_vector(vectors[idx])

    def predict(self, vectors, log_freq=10, verbose=True):
        self._logger.info("Predicting...")

        num_vectors = vectors.shape[0]
        neighbors = []
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Predicting vector {} of {}...".format(
                    idx, num_vectors))
            neighbors.append(self._engine.neighbours(vectors[idx]))
        return neighbors
Example #24
class LSHIndex(Index):

    def __init__(self, hasher, number_of_tables=6, length_of_tables=12, match_thresh=0.2, association_thresh=0.1, storage=memoryStorage):
        """
        :param hasher:
        @type hasher: Hasher
        """
        Index.__init__(self, hasher,
                       number_of_tables=number_of_tables,
                       length_of_tables=length_of_tables,
                       match_thresh=match_thresh,
                       association_thresh=association_thresh)
        self.hasher = hasher
        self.match_thresh = match_thresh
        self.association_thresh = association_thresh
        self.tables = [None]*number_of_tables
        for i in range(number_of_tables):
            self.tables[i] = RandomBinaryProjections(str(i), length_of_tables)
        self.engine = Engine(self.hasher.dims(),
                             lshashes=self.tables,
                             storage=storage(),
                             fetch_vector_filters=[NoVectorFilter()])

    def index(self, id, img):
        item = self.hasher.hash(id, img)
        for i in range(len(item.descriptors)):
            self.engine.store_vector(item.descriptors[i],data=(id, item.keypoints[i], item.descriptors[i]))
        return item

    def find(self, id, img, index_if_not_found=False):
        item = self.hasher.hash(id, img)
        matches = {}
        #count_min =self.association_thresh * float(len(item.descriptors))
        for x in item.descriptors:
            for neighbour in self.engine.neighbours(x):
                if neighbour[1][0] in matches:
                    continue
                y = neighbour[1][2]
                dist = l2norm(x, y)
                key = neighbour[1][0]
                if dist < self.match_thresh:
                    #if dist > 0.0001:
                    #    print('{} {} {}'.format(id, neighbour[1][0], dist))
                    matches[key] = (matches[key] + 1) if key in matches else 1
        if id not in matches and index_if_not_found:
            for i in range(len(item.descriptors)):
                self.engine.store_vector(item.descriptors[i], data=(id, item.keypoints[i], item.descriptors[i]))
        #for id, count in matches.items():
        #    #if count >= count_min:
        #    yield id
        return list(matches.keys())
Example #25
    def start(dataset, test_vector, num_nearest=5):

        # Create a random binary hash with 10 bits
        rbp = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        engine = Engine(dataset.shape[1], lshashes=[rbp])  # Engine expects the vector dimensionality

        # Index the dataset vectors (set their data to a unique string)
        for i, v in enumerate(dataset):
            engine.store_vector(v, 'data_%d' % i)

        # Get nearest neighbours
        N = engine.neighbours(test_vector)
Example #26
class lshNN(NNs):
    """
    Locality-sensitive hashing by random projection
        consider some options
    nearpy implementation
    """
    def __init__(self, b=16):
        self.params = {"method": "product quantization, numpy", 'b': b}

    def fit(self, X):
        b = self.params['b']
        self.n, self.f = X.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', b)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = np.squeeze(np.copy(X[i, :]))
            self.engine.store_vector(v, i)

    def _get_one_knn(self, v, k=3):
        v = np.squeeze(np.copy(v))
        vl = v.shape
        if vl[0] != self.f:
            # print(vl)
            raise Exception("Data Not Match")
        N = self.engine.neighbours(v)
        nni = -np.ones(k, dtype='int')
        nnd = np.empty(k)
        nnd[:] = np.nan
        for i in np.arange(k):
            try:
                nni[i] = N[i][1]
                nnd[i] = N[i][2]
            except IndexError:
                break
        return (nni, nnd)

    def get_knn(self, x, k=3):
        self.n, self.f = x.shape
        nni = -np.ones((self.n, k), dtype='int')
        nnd = np.empty((self.n, k))
        nnd[:] = np.nan
        for i in np.arange(self.n):
            i_i, i_d = self._get_one_knn(x[i, :], k)
            nni[i, :] = i_i
            nnd[i, :] = i_d
        return (nni, nnd)
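A possible usage sketch for the wrapper above, on random data (numpy assumed as np; the shapes and k are arbitrary):

X = np.random.randn(500, 32)
nn = lshNN(b=16)
nn.fit(X)
# -1 indices and NaN distances mark neighbours that could not be found.
indices, distances = nn.get_knn(X[:5, :], k=3)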
Example #27
class LSHRandomProjectionsIndex:

    def __init__(self, num_features, projection_count=30):
        self.num_features = num_features
        self.rbp = RandomBinaryProjections('default', projection_count)
        self.text_engine = Engine(num_features, lshashes=[self.rbp], distance=CosineDistance())

    def index(self, vector, key):
        if len(vector) != self.num_features:
            print("ERROR received vector.dim: " + str(len(vector)) + " on engine.dim: " + str(self.num_features))
            raise Exception
        self.text_engine.store_vector(vector, key)

    def query(self, vector):
        res = self.text_engine.neighbours(vector)
        return res
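For illustration, indexing a handful of random vectors and querying the same index (numpy assumed as np; the feature count and keys are arbitrary):

index = LSHRandomProjectionsIndex(num_features=300)
for key in range(10):
    index.index(np.random.randn(300), key)
# Each result is a (vector, key, cosine distance) tuple.
neighbours = index.query(np.random.randn(300))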
Example #28
def lshSearch(dataBase_, query_, k):
    featureNum_ = len(dataBase_)
    dimension_ = len(dataBase_[0])

    rbp_ = RandomBinaryProjections('rbp', 30)

    engine_ = Engine(dimension_,
                     lshashes=[rbp_],
                     vector_filters=[NearestFilter(k)])

    for i in range(featureNum_):
        v_ = dataBase_[i]
        engine_.store_vector(v_, '{}'.format(i))

    # Engine.neighbours takes only the query vector; the distance metric is configured on the Engine
    N_ = engine_.neighbours(query_)
    index_ = [int(x[1]) for x in N_]
    return index_
Example #29
class DB:
    def __init__(self, feature_size=16, nearest_neighbours=1000):
        self.feature_size = feature_size
        self.nn = nearest_neighbours
        self.engine = None
        self.load_hashmap()

    def load_hashmap(self):
        # Create redis storage adapter
        # need to start redis service
        redis_object = Redis(host='localhost', port=6379, db=14)
        redis_storage = RedisStorage(redis_object)
        try:
            config = redis_storage.load_hash_configuration('test')
            lshash = RandomBinaryProjections(None, None)
            lshash.apply_config(config)

        except Exception:
            # Config does not exist; create the hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 10)

        nearest = NearestFilter(self.nn)
        # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        self.engine = Engine(self.feature_size,
                             lshashes=[lshash],
                             vector_filters=[nearest],
                             storage=redis_storage,
                             distance=CosineDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)

    def query(self, fvector):
        query = np.asarray(fvector)

        # get nn nearest neighbours
        # a list of tuple (data, name, distance)
        N = self.engine.neighbours(query)
        return N

    def append_to_DB(self, fvector, name=""):
        if fvector is None:
            return
        self.engine.store_vector(np.asarray(fvector), name)
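A sketch of how this Redis-backed index might be used; it assumes a Redis server is reachable on localhost:6379 (db 14), as load_hashmap above expects, and the feature values are invented:

db = DB(feature_size=16, nearest_neighbours=5)
db.append_to_DB([0.1] * 16, name='song-001')
matches = db.query([0.1] * 16)    # list of (vector, name, distance) tuples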
Example #30
def test_sparse():
    dim = 500
    num_train = 1000
    num_test = 1
    train_data = ss.rand(dim, num_train)#pickle.load('/home/jmahler/Downloads/feature_objects.p')
    test_data = ss.rand(dim, num_test)

    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dim, lshashes=[rbp])

    for i in range(num_train):
        engine.store_vector(train_data.getcol(i))

    for j in range(num_test):
        N = engine.neighbours(test_data.getcol(j))
        print N

    IPython.embed()
Example #31
    def nearest_neighbour(self, fname):
        """
		Finds the "n_no" of nearest neighbours for each Query Question and writes it 
		in a file "fname" given as a parameter.
		"""
        rbp = RandomBinaryProjections('rbp', self.n_no)
        # Create a random binary hash with 10 bits
        engine = Engine(self.dim, lshashes=[rbp])
        # Create engine with pipeline configuration
        qout = open(fname, "w")
        for i in range(self.train_size + 1):
            engine.store_vector(np.transpose(self.vectors[i]), i)

        for i in range(self.train_size + 1, self.q_total):
            N = engine.neighbours(np.transpose(self.vectors[i]))
            qout.write(self.q_actual[i])
            for j in range(len(N)):
                qout.write("NN %d --- " % (j + 1) + self.q_actual[N[j][1]])
        qout.close()
Example #32
def test_sparse():
    dim = 500
    num_train = 1000
    num_test = 1
    train_data = ss.rand(
        dim,
        num_train)  #pickle.load('/home/jmahler/Downloads/feature_objects.p')
    test_data = ss.rand(dim, num_test)

    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dim, lshashes=[rbp])

    for i in range(num_train):
        engine.store_vector(train_data.getcol(i))

    for j in range(num_test):
        N = engine.neighbours(test_data.getcol(j))
        print N

    IPython.embed()
Example #33
    def debug():
        # Dimension of our vector space
        dimension = 500

        # Create a random binary hash with 10 bits
        rbp = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rbp])

        # Index 100000 random vectors (set their data to a unique string)
        for index in range(100000):
            v = numpy.random.randn(dimension)
            engine.store_vector(v, 'data_%d' % index)

        # Create random query vector
        query = numpy.random.randn(dimension)

        # Get nearest neighbours
        N = engine.neighbours(query)
Example #34
class lshsearcher:
    def __init__(self):
        self.__dimension = None
        self.__engine_perm = None
        self.__permutations = None

    def _set_confval(self, dimension=None):
        if dimension is None:
            return None
        else:
            self.__dimension = dimension

    def _engine_on(self):
        # Create permutations meta-hash
        self.__permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp_perm = RandomBinaryProjections('rbp_perm', 14)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.__permutations.add_child_hash(rbp_perm, rbp_conf)

        # Create engine
        self.__engine_perm = Engine(self.__dimension, lshashes=[self.__permutations], distance=CosineDistance())

    def conf(self, dimension):
        self._set_confval(dimension)
        self._engine_on()

    def getData(self, v):
        if self.__engine_perm is not None:
            self.__engine_perm.store_vector(v)

    def commitData(self):
        if self.__permutations is not None:
            self.__permutations.build_permuted_index()

    def find(self, v):
        if self.__engine_perm is not None:
            return self.__engine_perm.neighbours(v)
Example #35
def main(argv):
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    parser.add_argument('--hash-size', help='Hash size.', type=int, default=10)
    parser.add_argument('--num-tables',
                        help='Number of tables.',
                        type=int,
                        default=5)
    parser.add_argument('--query-index',
                        help='Index to use for query.',
                        type=int,
                        default=0)

    args = parser.parse_args(argv[1:])

    # read in the data file
    data = pandas.read_csv(args.source, sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp],
                    distance=EuclideanDistance())

    # indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i])

    # query a vector q_vec
    response = engine.neighbours(
        np.asarray(
            data['features'][args.query_index].split(',')).astype('float64'))

    pprint(response)
Example #36
class scLSH(object):
    def __init__(self, x):
        self.n, self.f = x.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', 10)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = x[i, :]
            self.engine.store_vector(v, i)

    def get_one_knn(self, v, k=3):
        vl = v.shape
        if vl[0] != self.f:
            print(vl)
            raise Exception("Data Not Match")
        N = self.engine.neighbours(v)
        nni = -np.ones(k, dtype='int')
        nnd = np.empty(k)
        nnd[:] = np.nan
        for i in np.arange(k):
            try:
                nni[i] = N[i][1]
                nnd[i] = N[i][2]
            except IndexError:
                break
        return (nni, nnd)

    def get_knn(self, x, k=3):
        self.n, self.f = x.shape
        nni = -np.ones((self.n, k), dtype='int')
        nnd = np.empty((self.n, k))
        nnd[:] = np.nan
        for i in np.arange(self.n):
            i_i, i_d = self.get_one_knn(x[i, :])
            nni[i, :] = i_i
            nnd[i, :] = i_d
        return (nni, nnd)
Example #37
class GraphStateQueryIndex:
    def __init__(self):
        redis_object = redis.Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)

        # Get hash config from redis
        config = redis_storage.load_hash_configuration('MyHash')

        if config is None:
            # Config does not exist; create the hash from scratch, with 5 projections
            self.lshash = RandomBinaryProjections('MyHash', 5)
        else:
            # Config exists; create the hash with None parameters
            self.lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            self.lshash.apply_config(config)
        # print("HERE")

        # Create engine for our 4-dimensional feature space and use our hash.
        # This will set the dimension of the lshash only the first time, not when
        # using the configuration loaded from redis. Use redis storage to store
        # buckets.
        self.engine = Engine(4, lshashes=[self.lshash], storage=redis_storage)
        redis_storage.store_hash_configuration(self.lshash)

    def findMatch(self, v):
        matches = self.engine.neighbours(v)
        return matches

    def addVector(self, v, trainingText):
        self.engine.store_vector(v, trainingText)

    def clearIndex(self):
        self.engine.clean_all_buckets()

    def clearHashInstance(self, name):
        self.engine.clean_buckets(name)
Example #38
class NearPy(NearestNeighbor):
    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_,
                              lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)

        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]

        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()


#        logging.info('Took %f sec' %(train_end - train_start))

    def within_distance(self, x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)

        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)

        if len(query_result) == 0:
            return [], []

        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
Example #39
def example2():

    # Dimension of feature space
    DIM = 100

    # Number of data points (dont do too much because of exact search)
    POINTS = 20000

    ##########################################################

    print 'Performing indexing with HashPermutations...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with HashPermutationMapper...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutationMapper:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with multiple binary hashes...'
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print '\nNeighbour distances with multiple binary hashes:'
    print '  -> Candidate count is %d' % engine_rbps.candidate_count(query)
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
Example #40
class testing_suite:
	"""
	Class to test SDF files in a nearest neighbor lookup format, under different models of representation 
	such as PCA, FactorAnalysis, KernelPCA with the rbf kernel, FastICA, and DictionaryLearning

	Sample Usage:

		test=testing_suite()
		test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver")
		num_train=12
		num_test=4

		test.make_train_test(num_train,num_test)
		accuracy,results=test.perform_PCA_tests()
	"""

	def __init__(self):
		self.PCA_changed_=True
		self.FA_changed_=True
		self.KPCA_changed_=True
		self.FICA_changed_=True
		self.DL_changed_=True
		self.all_files_=[]
		self.PCA_=None
		self.FA_ = None
		self.KPCA_ = None
		self.FICA_ = None
		self.DL_ = []
		self.testing_=[]
		self.training_=[]
		self.engine_=[]
		self.training_vectors_=None
		self.confusion_={}
		self.biggest=0

	def adddir(self,dir_to_add):
		"""
			add all sdf filepaths from a root directory tree (dir_to_add) to the all_files_
			instance variable
		"""
		sdf_files = []
		for root,dirs,files in walk(dir_to_add):
			for file_ in files:
				if file_.endswith("25.sdf"):
					sdf_files.append(path.join(root,file_))
		self.all_files_+=sdf_files

	def adddir_25(self,dir_to_add):
		"""add files in a directory only with dimension 12"""
		sdf_files = []
		for root,dirs,files in walk(dir_to_add):
			for file_ in files:
				if file_.endswith(".sdf"):
					tempsdf=SDF(path.join(root,file_))
					if tempsdf.dimensions()[0]==25*25*25:
						sdf_files.append(path.join(root,file_))
		self.all_files_+=sdf_files

	def addfile(self,file_to_add):
		"""add only one file to all_files"""
		self.all_files_.append(file_to_add)

	def make_train_test(self,num_train, num_test):
		"""
		populates the list of training files and testing files with filepaths based on a random
		number generator seeded with np.random.seed(100)

		Sample Usage:
			test=testing_suite()
			test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver")
			num_train=12
			num_test=4

			test.make_train_test(num_train,num_test)
		"""
		assert num_train+num_test<=len(self.all_files_)
		np.random.seed(100)
		permuted_indices = np.random.permutation(len(self.all_files_))
		get_training = itemgetter(*permuted_indices[:num_train])
		get_testing = itemgetter(*permuted_indices[num_train:num_train+num_test])
		if num_train > 1:
			self.training_ = get_training(self.all_files_)
		else:
			self.training_= [get_training(self.all_files_)]


		if num_test > 1:
			self.testing_ = get_testing(self.all_files_)
		else:
			self.testing_ = [get_testing(self.all_files_)]

	def normalize_vector(self,vector,largest_dimension):
		"""normalizes smaller sdf vectors to a larger size by vertical stacking a column of zeros underneath"""
		return np.vstack((vector,np.zeros((largest_dimension-vector.shape[0],1))))

	def get_PCA_training_vectors(self):
		"""
		gets all training_vectors from the set of training files, normalizes them using normalize 
		vector and adds them all to a numpy array that gets returned
		"""
		training_sdf=[SDF(i) for i in list(self.training_)]
		
		self.biggest=0
		for item in training_sdf:
			self.biggest=max(self.biggest,item.dimensions()[0])
		return_train_vectors=None
		for tempsdf in training_sdf:
			vectorized=np.reshape(tempsdf.data(),(tempsdf.dimensions()[0],1))
			normal_vector=self.normalize_vector(vectorized,self.biggest)
			if return_train_vectors==None:
				return_train_vectors=normal_vector
			else:
				return_train_vectors=np.concatenate((return_train_vectors,normal_vector),axis=1)
		return return_train_vectors

	"""
	-any function begining with make creates the sklearn.decomposition framework for the specified 
	decomposition type 
	-any function begining with fit fits the training vectors to the decomposition framework
	-any function begining with transform transforms the training vectors based on the fitted 
	decomposition framework
	"""

	def render_sdf(self, a, thresh = 1e-3):
		"""scatter-plots the near-zero (surface) voxels of a 3D SDF grid"""
		h = plt.figure()
		ax = h.add_subplot(111, projection = '3d')

		surface_points = np.where(np.abs(a) < thresh)

		x = surface_points[0]
		y = surface_points[1]
		z = surface_points[2]
		ax.scatter(x, y, z)

		ax.set_xlabel('X')
		ax.set_ylabel('Y')
		ax.set_zlabel('Z')
		ax.set_xlim3d(0,a.shape[0])
		ax.set_ylim3d(0,a.shape[1])
		ax.set_zlim3d(0,a.shape[2])

		plt.show()


	def make_PCA(self):
		self.PCA_=skdec.PCA() #n_components='mle'

	def fit_PCA(self,training_vectors):
		self.PCA_.fit(training_vectors)

	def make_FA(self):
		self.FA_=skdec.FactorAnalysis(n_components=len(list(self.training_)))

	def fit_FA(self,training_vectors):
		self.FA_.fit(training_vectors)

	def make_KPCA(self,kernel_option="rbf"):
		self.KPCA_=skdec.KernelPCA(gamma=0.1, kernel=kernel_option)

	def fit_KPCA(self,training_vectors):
		self.KPCA_.fit(training_vectors)

	def make_FICA(self):
		self.FICA_=skdec.FastICA(n_components=len(list(self.training_)))

	def fit_FICA(self,training_vectors):
		self.FICA_.fit(training_vectors)

	def make_DL(self,alpha_values):
		self.DL_.append(skdec.DictionaryLearning(n_components=len(list(self.training_)), alpha=alpha_values, transform_algorithm='omp'))

	def fit_DL(self,training_vectors):
		self.DL_[-1].fit(training_vectors)

	def load_PCA(self,vector_set):
		"""reinitializes our engine and loads the PCA-transformed training vectors into self.engine_"""
		rbp = RandomBinaryProjections('rbp', 10)
		# components_ has shape (n_components, n_features); the transformed vectors have
		# n_components entries, so the engine must be built with that dimension
		self.engine_ = Engine(self.PCA_.components_.shape[0], lshashes=[rbp])
		transformed_vectors = self.PCA_.transform(vector_set.T)
		for i in range(len(list(self.training_))):
			#vector=vector_set[:,i]
			#vector=np.reshape(vector,(self.biggest,1))
			#vector=self.PCA_.transform(vector)
			self.engine_.store_vector(transformed_vectors[i,:], self.training_[i])

	def load_FA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.FA_.transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])

	def load_KPCA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.KPCA_.alphas_.shape[1], lshashes=[rbp])
		transformed_vectors = self.KPCA_.transform(vector_set.T)
		for i in range(len(list(self.training_))):
			#vector=vector_set[:,i]
			#vector=np.reshape(vector,(self.biggest,1))
			#vector=self.KPCA_.transform(vector)
			self.engine_.store_vector(transformed_vectors[i,:], self.training_[i])

	def load_FICA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.FICA_.transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])

	def load_DL(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.DL_[-1].transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])		

	def engine_query(self,test_vector):
		"""
		queries the engine with a (self.biggest,1) dimension vector and returns the file_names of nearest
		neighbors and the results
		"""
		#print test_vector
		#reshaped=np.reshape(test_vector,(self.biggest,1))
		results = self.engine_.neighbours(test_vector.T)
		file_names = [i[1] for i in results]
		return file_names, results

	def setup_confusion(self):
		"""
		reinitializes the self.confusion_ confusion matrix variable
		"""
		self.confusion_={}
		self.confusion_[UNKNOWN_TAG] = {}
		for file_ in self.all_files_:
			category = cat50_file_category(file_)
			self.confusion_[category] = {}
		for query_cat in self.confusion_.keys():
			for pred_cat in self.confusion_.keys():
				self.confusion_[query_cat][pred_cat] = 0

	"""
	Makes a test vector by taking in an SDF, reshaping it, normalizing it, then returns a transformed
	version of that vector based on the corresponding decomposition model that was already trained
	"""

	def make_test_vector(self,sdf_array,vector_type):
		if vector_type=="PCA":
			return self.make_PCA_test_vector(sdf_array)
		elif vector_type=="FA":
			return self.make_FA_test_vector(sdf_array)
		elif vector_type=="KPCA":
			return self.make_KPCA_test_vector(sdf_array)
		elif vector_type=="FICA":
			return self.make_FICA_test_vector(sdf_array)
		elif vector_type=="DL":
			return self.make_DL_test_vector(sdf_array)

	def make_DL_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		normalized=self.normalize_vector(reshaped,self.biggest)
		return self.DL_[-1].transform(normalized)[:,0]

	def make_FICA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		normalized=self.normalize_vector(reshaped,self.biggest)
		return self.FICA_.transform(normalized)[:,0]

	def make_KPCA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		return self.KPCA_.transform(reshaped.T)
#		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
#		normalized=self.normalize_vector(reshaped,self.biggest)
#		return self.KPCA_.transform(normalized)[:,0]

	def make_FA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		normalized=self.normalize_vector(reshaped,self.biggest)
		return self.FA_.transform(normalized)[:,0]

	def make_PCA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		return self.PCA_.transform(reshaped.T)
#		IPython.embed()
#		normalized=self.normalize_vector(reshaped,self.biggest)
#		return self.PCA_.transform(normalized)[:,0]

	"""
	querys the loaded and trained engine with each of your test vectors from make_train_test
		Returns
	        accuracy: float representing the accuracy of querying the nearpy engine with the test results
	        test_results: dictionary of the results from the "testing" for each of the sdf_files 
	"""
	def perform_tests(self,K,test_type):
		test_results={}
		for file_ in list(self.testing_):
			query_category=cat50_file_category(file_)
			print "Querying: %s with category %s "%(file_, query_category)
			converted = SDF(file_)
			test_vector=self.make_test_vector(converted,test_type)
			closest_names, closest_vals=self.engine_query(test_vector.T[:,0])

			pred_category=UNKNOWN_TAG

			if len(closest_names)>0:
				closest_category=closest_names[0]
				pred_category=cat50_file_category(closest_category)

				for i in range(1,min(K,len(closest_names))):
					closest_category = closest_names[i]
					potential_category = cat50_file_category(closest_category)

					if potential_category == query_category:
						pred_category = potential_category
			print "Result Category: %s"%(pred_category)

			self.confusion_[query_category][pred_category] += 1
			test_results[file_]= [(closest_names, closest_vals)]

		row_names=self.confusion_.keys()
		confusion_mat=np.zeros([len(row_names),len(row_names)])
		i=0
		for query_cat in self.confusion_.keys():
			j = 0
			for pred_cat in self.confusion_.keys():
				confusion_mat[i,j] = self.confusion_[query_cat][pred_cat]
				j += 1
			i += 1

		# get true positives, etc. for each category
		num_preds = len(self.testing_)
		tp = np.diag(confusion_mat)
		fp = np.sum(confusion_mat, axis=0) - np.diag(confusion_mat)
		fn = np.sum(confusion_mat, axis=1) - np.diag(confusion_mat)
		tn = num_preds * np.ones(tp.shape) - tp - fp - fn

		# compute useful statistics
		recall = tp / (tp + fn)
		tnr = tn / (fp + tn)
		precision = tp / (tp + fp)
		npv = tn / (tn + fn)
		fpr = fp / (fp + tn)
		accuracy = np.sum(tp) / num_preds # correct predictions over entire dataset

		# remove nans
		recall[np.isnan(recall)] = 0
		tnr[np.isnan(tnr)] = 0
		precision[np.isnan(precision)] = 0
		npv[np.isnan(npv)] = 0
		fpr[np.isnan(fpr)] = 0

		return accuracy, test_results, recall, tnr, precision,npv,fpr


	def vis_pca_components(self, num_comp_vis, thresh = 0.01, method = 'PCA'):
		"""3D scatter-plots the near-zero voxels of the first num_comp_vis components"""
		PCA = self.PCA_
		if method == 'KPCA':
			PCA = self.KPCA_
		num_components = PCA.components_.shape[0]
		num_components = min(num_comp_vis, num_components)

		comp_per_dim = int(math.ceil(math.sqrt(num_components)))
		h = plt.figure()
		for i in range(num_components):
			ax = h.add_subplot(comp_per_dim, comp_per_dim, i+1, projection = '3d')
			components = PCA.components_[i,:]
			comp_grid = components.reshape(25, 25, 25)

			surface_points = np.where(np.abs(comp_grid) < thresh)
			x = surface_points[0]
			y = surface_points[1]
			z = surface_points[2]

			ax.scatter(x, y, z)
			ax.set_xlabel('X')
			ax.set_ylabel('Y')
			ax.set_zlabel('Z')
			ax.set_xlim3d(0,25)
			ax.set_ylim3d(0,25)
			ax.set_zlim3d(0,25)
			ax.set_title('Component %d'%(i))
		plt.show()

	def vis_pca_component_slices(self, num_comp_vis, method = 'PCA'):
		"""plots the central XY, XZ and YZ slices of the first num_comp_vis components"""
		PCA = self.PCA_
		if method == 'KPCA':
			PCA = self.KPCA_
		num_components = PCA.components_.shape[0]
		num_components = min(num_comp_vis, num_components)

		comp_per_dim = int(math.ceil(math.sqrt(num_components)))
		plt.figure()
		for i in range(num_components):
			plt.subplot(comp_per_dim, comp_per_dim, i+1)
			components = PCA.components_[i,:]
			comp_grid = components.reshape(25, 25, 25)
			comp_slice = comp_grid[:,:,12]

			plt.imshow(comp_slice)
			plt.title('Component %d XY Plane'%(i))

		plt.figure()
		for i in range(num_components):
			plt.subplot(comp_per_dim, comp_per_dim, i+1)
			components = PCA.components_[i,:]
			comp_grid = components.reshape(25, 25, 25)
			comp_slice = comp_grid[:,12,:]

			plt.imshow(comp_slice)
			plt.title('Component %d XZ Plane'%(i))

		plt.figure()
		for i in range(num_components):
			plt.subplot(comp_per_dim, comp_per_dim, i+1)
			components = PCA.components_[i,:]
			comp_grid = components.reshape(25, 25, 25)
			comp_slice = comp_grid[12,:,:]

			plt.imshow(comp_slice)
			plt.title('Component %d YZ Plane'%(i))
		plt.show()


	"""
	runs perform_tests on a specific type of decomposition after creating that decomposition type 
	framework with the training vectors and loading those training vectors into the engine

	K is the number of neighbors to check
	"""
	def perform_PCA_tests(self,K):
		train_vectors=self.get_PCA_training_vectors()
		self.make_PCA()
		print 'Fitting PCA'
		self.fit_PCA(train_vectors.T)
		print 'Loading PCA'
		self.load_PCA(train_vectors)
		print 'Setup confusion'
		self.setup_confusion()
		print 'Eval accuracy'
		#IPython.embed()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"PCA")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def perform_FA_tests(self,K):
		train_vectors=self.get_PCA_training_vectors()
		self.make_FA()
		self.fit_FA(train_vectors)
		self.load_FA(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"FA")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def perform_KPCA_tests(self,K,kernel="rbf"):
		train_vectors=self.get_PCA_training_vectors()
		self.make_KPCA(kernel_option=kernel)
		print 'Fitting KPCA'
		self.fit_KPCA(train_vectors.T)
		print 'Loading KPCA'
		self.load_KPCA(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"KPCA")
		IPython.embed()
		return accuracy,test_results, recall, tnr, precision,npv,fpr


	def perform_FICA_tests(self,K):
		train_vectors=self.get_PCA_training_vectors()
		self.make_FICA()
		self.fit_FICA(train_vectors)
		self.load_FICA(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"FICA")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def perform_DL_tests(self,K,alpha):
		train_vectors=self.get_PCA_training_vectors()
		self.make_DL(alpha_values=alpha)
		self.fit_DL(train_vectors)
		self.load_DL(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"DL")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def get_engine(self):
		return self.engine_

	def get_PCA(self):
		return self.PCA_

	def get_FA(self):
		return self.FA_

	def get_KPCA(self):
		return self.KPCA_

	def get_FICA(self):
		return self.FICA_

	def get_DL(self):
		return self.DL_

	def get_explained_variance_ratio(self):
		return self.PCA_.explained_variance_ratio_
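
A minimal driver sketch for the suite above (not part of the original listing); the SDF directory path and the neighbour count K are placeholders, and adddir_25 is used so that every grid shares the 25x25x25 dimension the visualisation helpers assume.

suite = testing_suite()
suite.adddir_25('/path/to/Cat50_ModelDatabase/screwdriver')  # placeholder directory
suite.make_train_test(12, 4)

# Every perform_*_tests method follows the same pattern; PCA shown here with K=5 neighbours.
accuracy, results, recall, tnr, precision, npv, fpr = suite.perform_PCA_tests(5)
print 'PCA accuracy: %f' % accuracy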
Ejemplo n.º 41
0
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2], distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1 

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)
 
    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = map(float, self.face_feature[person].split(','))
            print '\nNeighbour distances with multiple binary hashes:'
            print '  -> Candidate count is %d' % self.engine.candidate_count(query)
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]
        res = zip(dists, scores, t_num)
        res.sort(key = lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [ x for x in zip_seq if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
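
A hypothetical usage sketch for the class above; the feature file path, feature dimension and person identifier are placeholders, and the file is assumed to hold one "<name> <comma-separated-feature>" record per line, as the constructor expects.

searcher = LSHSearch('face_features.txt', 128, 10, 8)   # placeholder file and parameters
searcher.build()
for name, score, true_count in searcher.query(['Aaron_Eckhart_0001']):
    print name, score, true_count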
Ejemplo n.º 42
0
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        self.permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections(
            'testPCABPHash', lsh_project_num, matrix)
        self.permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(
            self.dimension,
            lshashes=[self.permutations2],
            distance=CosineDistance(),
            vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)

    def update(self, person, feature):
        print feature
        v = map(float, feature.split(','))
        epoch_time = long(time.time())
        f_name = person + '_' + str(epoch_time)
        print f_name
        self.engine.store_vector(v, f_name)

    def query(self, person_feature):
        dists = []
        scores = []

        query = map(float, person_feature.split(','))
        # print '\nNeighbour distances with multiple binary hashes:'
        # print '  -> Candidate count is %d' % self.engine.candidate_count(query)
        results = self.engine.neighbours(query)
        dists = dists + [x[1] for x in results]
        scores = scores + [x[2] for x in results]

        res = zip(dists, scores)
        res.sort(key=lambda t: t[1])
        return res[:self.neighbour]
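
A short sketch of how the update/query pair in this variant might be used to register a new face at runtime; the file path, dimension and feature values are placeholders, not from the original example.

import numpy as np
searcher = LSHSearch('face_features.txt', 128, 5, 8)
searcher.build()
new_feature = ','.join(str(v) for v in np.random.randn(128))  # placeholder feature string
searcher.update('Alice', new_feature)   # stored under 'Alice_<epoch seconds>'
matches = searcher.query(new_feature)   # list of (name, score) pairs, closest first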
Ejemplo n.º 43
0
class RMAX_repr(Representation):
    """
    Identical to the Tabular representation (i.e., it assigns a binary feature function
    f_{d}() to each possible discrete state *d* in the domain, with
    f_{d}(s) = 1 when d=s, 0 elsewhere.
    HOWEVER, unlike *Tabular*, feature functions are only created for *s* which
    have been encountered in the domain, not instantiated for every single 
    state at the outset.

    """
    def __init__(self, domain, Rmax, LQ, k = 1, epsilon_d = 0.01):
        # LQ is the Lipschitz constant, 10**3 according to the paper (chosen by cross-validation)
        self.LQ = LQ
        self.gamma = domain.discount_factor

        self.rmax = Rmax
        self.qmax = Rmax / (1-self.gamma)
        self.qmax_tilda = Rmax + self.gamma * self.qmax
        self.epsilon = epsilon_d

        # Approximate k-NN is used when finding the Q value of a point
        self.k = k

        # We also keep track of the states sampled so far
        self.sample_list = [0]*(2*100000)
        self.list_idx = 0
        # And a dictionary for quick lookups of already computed values
        self.sample_values = {}

        # And we use an LSH to find the approximate k-Nearest neighbours
        # by training it on every s, a, r, s' tuple we see
        self.init_randomization()
        
        super(RMAX_repr, self).__init__(domain)

    
    def init_randomization(self):
        rbp = RandomBinaryProjections('rbp', 10)
        from nearpy.distances import ChebyshevDistance
        self.engine = Engine(7, lshashes = [rbp], vector_filters=[NearestFilter(self.k)], distance=ChebyshevDistance())

    def is_known(self, s, a):
        # A s, a pair is 'known' if LQ * d(s, a, s', a') < epsilon_d
        indices = self.approx_nn(s, a)
        if not indices:
            return False

        for idx in indices:
            s_p, a_p = self.sample_list[idx]
            if self.LQ * self.d(s, a, s_p, a_p) > self.epsilon:
                return False
        return True

    def pre_discover(self, s, p_terminal, a, r, ns, terminal):
        # In the learning stage, if sa is not 'known' add it to the sample list
        # and its value to sample value.
        if not self.is_known(s, a):
            x = r + self.gamma * max(self.Q_tilda(ns, a_p) for a_p in range(self.actions_num))

            self.engine.store_vector(np.append(s, a), self.list_idx)            
            self.sample_list[self.list_idx]= (s, a)
            self.list_idx+=1
            self.sample_values[self.sa_tuple(s, a)] = x

            #self.LSH.partial_fit(np.append(s, a))
        super(RMAX_repr, self).pre_discover(s, p_terminal, a, ns, terminal)

    # Compute a distance metric between (s, a) and (ns, na).
    # Using max-norm as in the paper for now.
    def d(self, s, a, ns, na):
        # Create one big s,a array
        sa = np.append(s, a)
        nsa = np.append(ns, na)
        # Use scipy to compute the chebyshev distance => Max norm
        return distance(sa, nsa)

    def approx_nn(self, s, a):
        #dist, indices = self.LSH.kneighbors(np.append(s, a))
        # returns a list of sample_list indices for the approximate nearest neighbours
        l = self.engine.neighbours(np.append(s, a))
        indices = [elem[1] for elem in l]
        return indices

    def sa_tuple(self, s, a):
        return tuple(np.append(s, a))
    
    # The approximate Q function 
    def Q_tilda(self, s, a):
        k = self.k
        q = 0.0
        # First get the k-nearest sampled neighbours to this point using LSH
        indices = self.approx_nn(s, a)
        num_neighbors = 0

        for index in indices:
            sj, aj = self.sample_list[index]
            dij = self.d(s, a, sj, aj)
            if dij <= (self.qmax / self.LQ):
                xj = self.sample_values[self.sa_tuple(sj, aj)]
                q += dij * self.LQ + xj
                num_neighbors += 1

        # In case there were fewer than k neighbors, use qmax_tilda for the remaining
        for i in range(num_neighbors, k):
            q += self.qmax_tilda
        # Return the average Q
        return q/k
        

    def Qs(self, s, terminal, phi_s=None):
        # Q -> Array of Q(s, a) values for this state
        # A -> Corresponding IDs

        # Before any learning is done, the experiment calls the policy to
        # estimate prior performance. In that case, the LSHF would throw a 
        # Value Error. We pre-empt that here
        Q = np.zeros((self.actions_num))
        #try :
        #    self.LSH.kneighbors(np.append(s, 0))
        #except ValueError:
        #    return Q
    
        for a in range(self.actions_num):
            Q[a] = self.Q_tilda(s, a)
        return Q
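
For reference, a standalone sketch of the approximate (s, a) lookup that init_randomization and approx_nn implement above, outside the rlpy Representation machinery; the 6-dimensional state and the action index are made-up values.

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.filters import NearestFilter
from nearpy.distances import ChebyshevDistance

k = 1
engine = Engine(7, lshashes=[RandomBinaryProjections('rbp', 10)],
                vector_filters=[NearestFilter(k)], distance=ChebyshevDistance())

s, a = np.random.rand(6), 2                      # state (6 dims) + action index = 7 features
engine.store_vector(np.append(s, a), 0)          # data payload = index into a sample list
indices = [elem[1] for elem in engine.neighbours(np.append(s, a))]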
Ejemplo n.º 44
0
class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost',
                                  port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for j in range(len(keys1)):
                self.assertEqual(keys1[j], keys2[j])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)


        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for j in range(len(keys1)):
                self.assertEqual(keys1[j], keys2[j])
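
The same persistence pattern condensed into a non-test sketch: save a hash configuration once it has been initialised, then restore it later (for example after a restart) so queries keep hitting the same buckets. The hash name, dimensionality and Redis connection details are assumptions.

import numpy
from redis import Redis
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjectionTree
from nearpy.storage import RedisStorage

redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))

rbpt = RandomBinaryProjectionTree('myHash', 10, 20)
Engine(100, lshashes=[rbpt])                  # building the engine initialises the hash for dim 100
redis_storage.store_hash_configuration(rbpt)  # persist the projection normals etc.

# Later, in another process: rebuild an identical hash instead of re-randomizing.
restored = RandomBinaryProjectionTree(None, None, None)
restored.apply_config(redis_storage.load_hash_configuration('myHash'))
print(restored.hash_vector(numpy.random.randn(100), querying=True))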
Ejemplo n.º 45
0
for next_read_line in f:
    next_read_line = next_read_line.rstrip()

    split_arr = next_read_line.split(" ")
    split_arr = split_arr[1:]
    split_arr = list(map(float, split_arr))

    vector = numpy.asarray(split_arr)

    if i == 639:
        query = vector
        # print(query)
    else:
        vec_data = numpy.append(vector, i)
        engine.store_vector(vector, tuple(vec_data))

    i += 1

# Get nearest neighbors:
N = engine.neighbours(query)

# Number of nearest neighbors:
print(len(N))

print("Nearest Neighbors")

for x in N:
    # Print the id of each neighbour (the index appended at position `dimension` of its data tuple):
    print(x[1][dimension])
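
The fragment above assumes some setup that is not shown: an open file handle f with one "<id> <v1> <v2> ..." record per line, a nearpy engine, a running index i, and the feature dimension. A minimal sketch of that setup, with the file name, dimension and projection count as guesses:

import numpy
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

dimension = 128                                            # length of each feature vector
engine = Engine(dimension, lshashes=[RandomBinaryProjections('rbp', 10)])

f = open('features.txt')                                   # whitespace-separated records
i = 0
query = None                                               # set when record 639 is reached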
Ejemplo n.º 46
0
def example1():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't use too many because of the exact search at the end)
    POINTS = 10000

    print 'Creating engines'

    # We want 20 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print 'Indexing %d random vectors of dimension %d' % (POINTS, DIM)

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print 'Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys())
    print 'Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys())

    print 'Building permuted index for HashPermutations'

    # Then update permuted index
    permutations.build_permuted_index()

    print 'Generating random query vector'

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print '\nNeighbour distances with RandomBinaryProjectionTree:'
    print '  -> Candidate count is %d' % engine_rbpt.candidate_count(query)
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 2
    print '\nNeighbour distances with RandomBinaryProjections:'
    print '  -> Candidate count is %d' % engine.candidate_count(query)
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutations2:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
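
A hypothetical entry point for running the comparison above as a script:

if __name__ == '__main__':
    example1()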