Example #1
    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()
Example #2
    def test_random_discretized_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # First get recall and precision for one 1-dim random hash
        rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
        nearest = NearestFilter(10 + 1)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
            (recall1, precision1, searchtime1))

        # Then get recall and precision for one 2-dim random hash with a larger bin width
        rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        result = exp.perform_experiment([engine])

        recall2 = result[0][0]
        precision2 = result[0][1]
        searchtime2 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
            (recall2, precision2, searchtime2))

        # Many things are random here, but the precision should increase
        # with dimension
        self.assertTrue(precision2 > precision1)
Example #3
    def test_experiment_with_list_1(self):
        dim = 50
        vector_count = 100
        vectors = []
        for index in range(vector_count):
            vectors.append(numpy.random.randn(dim))
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(10 + 1)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        # Both recall and precision must be one in this case
        self.assertEqual(result[0][0], 1.0)
        self.assertEqual(result[0][1], 1.0)
Example #4
    def test_experiment_with_unibucket_3(self):
        dim = 50
        vector_count = 100
        vectors = numpy.random.randn(dim, vector_count)
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(5 + 1)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        # In this case recall is only 0.5
        # because the engine returns 5 nearest, but
        # the experiment looks for 10 nearest.
        self.assertEqual(result[0][0], 0.5)
        self.assertEqual(result[0][1], 1.0)
Example #5
    def test_experiment_with_unibucket_1(self):
        dim = 50
        vector_count = 100
        vectors = numpy.random.randn(dim, vector_count)
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(10 + 1)
        engine = Engine(dim,
                        lshashes=[unibucket],
                        vector_filters=[nearest],
                        distance=EuclideanDistance())
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        # Both recall and precision must be one in this case
        self.assertEqual(result[0][0], 1.0)
        self.assertEqual(result[0][1], 1.0)
Example #6
    def test_experiment_with_list_2(self):
        dim = 50
        vector_count = 100
        vectors = []
        for index in range(vector_count):
            vectors.append(numpy.random.randn(dim))
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(10 + 1)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(5, vectors)
        result = exp.perform_experiment([engine])

        # In this case precision is only 0.5
        # because the engine returns 10 nearest, but
        # the experiment only looks for 5 nearest.
        self.assertEqual(result[0][0], 1.0)
        self.assertEqual(result[0][1], 0.5)
Example #7
    def test_random_binary_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # Get recall, precision and search time for one 32-bit random binary hash
        rbp = RandomBinaryProjections('rbp', 32)
        nearest = NearestFilter(10 + 1)
        engine = Engine(dim, lshashes=[rbp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RBP: %f, Precision RBP: %f, SearchTime RBP: %f\n' % \
            (recall1, precision1, searchtime1))
Example #8
class TestVectorFilters(unittest.TestCase):
    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        result = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 3)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)

    def test_nearest(self):
        result = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 5)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)
        self.assertIn(self.V[2], result)
        self.assertIn(self.V[3], result)

    def test_unique(self):
        W = self.V  # note: W is an alias of self.V, not a copy
        W.append((numpy.array([7]), 'data8', 2.8))
        W.append((numpy.array([0]), 'data1', 2.8))
        W.append((numpy.array([1]), 'data2', 2.8))
        W.append((numpy.array([6]), 'data7', 2.8))

        result = self.unique.filter_vectors(W)
        self.assertEqual(len(result), 8)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data {}'.format(k)
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            self.assertEqual(len(n), 20)

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(
            self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for i in range(len(keys1)):
                self.assertEqual(keys1[i], keys2[i])
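
The test above reads self.redis_storage, which is not created in the setUp shown here. A minimal sketch of that missing piece, assuming NearPy's RedisStorage backend and the redis-py client (host, port and db values are placeholders):

from redis import Redis
from nearpy.storage import RedisStorage

# Wrap a redis-py connection in NearPy's RedisStorage backend; in the test
# class this object would be assigned to self.redis_storage inside setUp().
redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))

The Engine constructor below shows the defaults that apply when the hash, distance, filter and storage arguments are omitted.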
    def __init__(self,
                 dim,
                 lshashes=None,
                 distance=None,
                 fetch_vector_filters=None,
                 vector_filters=None,
                 storage=None):
        """ Keeps the configuration. """
        if lshashes is None:
            lshashes = [RandomBinaryProjections('default', 10)]
        self.lshashes = lshashes
        if distance is None: distance = CosineDistance()
        self.distance = distance
        if vector_filters is None: vector_filters = [NearestFilter(10)]
        self.vector_filters = vector_filters
        if fetch_vector_filters is None:
            fetch_vector_filters = [UniqueFilter()]
        self.fetch_vector_filters = fetch_vector_filters
        if storage is None: storage = MemoryStorage()
        self.storage = storage

        # Initialize all hashes for the data space dimension.
        for lshash in self.lshashes:
            lshash.reset(dim)
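
A minimal usage sketch of this constructor, relying only on the defaults listed above; the import paths follow NearPy's public API, and the data labels are made up for illustration:

import numpy
from nearpy import Engine

# 100-dimensional engine with all defaults: RandomBinaryProjections('default', 10),
# CosineDistance, NearestFilter(10), UniqueFilter and MemoryStorage.
engine = Engine(100)

# Store a few random vectors, then query with a fresh one; neighbours() returns
# a list of (vector, data, distance) tuples after filtering.
for k in range(50):
    engine.store_vector(numpy.random.randn(100), 'data_%d' % k)

neighbours = engine.neighbours(numpy.random.randn(100))
print(len(neighbours))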
Example #12
# Create data set from two clusters
# (dimension and vector_count are assumed to be set earlier in the script)
vectors = []

center = numpy.random.randn(dimension)
for index in range(vector_count // 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)

center = numpy.random.randn(dimension)
for index in range(vector_count // 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)

# We are looking for the N closest neighbours
N = 20
nearest = NearestFilter(N)

# We will fill this array with all the engines we want to test
engines = []

print('Creating engines...')

# We are going to test these bin widths
bin_widths = [0.01 * x for x in range(1, 5)]
# Create engines for all configurations
for bin_width in bin_widths:
    # Use four random 4-dim discretized projections
    rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width)
    rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width)
    rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width)
    rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width)
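
The snippet breaks off before the engines are actually built. A plausible continuation inside the same loop, assuming the four projections are combined into one engine per bin width and collected in the engines list declared above (dimension is the variable already used when generating the data set):

    # Combine the four projections in one engine and keep it for later testing
    engine = Engine(dimension,
                    lshashes=[rdp1, rdp2, rdp3, rdp4],
                    vector_filters=[nearest])
    engines.append(engine)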