Ejemplo n.º 1
0
    def _append_distances(self, v, distance, candidates):
        """ Apply distance implementation if specified """
        if distance:
            # Normalize vector (stored vectors are normalized)
            nv = unitvec(v)
            candidates = [(x[0], x[1], self.distance.distance(x[0], nv)) for x
                            in candidates]

        return candidates
Ejemplo n.º 2
0
    def _append_distances(self, v, distance, candidates):
        """ Apply distance implementation if specified """
        if distance:
            # Normalize vector (stored vectors are normalized)
            nv = unitvec(v)
            candidates = [(x[0], x[1], self.distance.distance(x[0], nv))
                          for x in candidates]

        return candidates
Ejemplo n.º 3
0
 def test_retrieval_sparse(self):
     """
     Store a random sparse vector, query with the same vector and check
     that the first result is its normalized form with the stored data
     and a distance of (almost) zero. Repeated 100 times on empty buckets.
     """
     tolerance = 0.000000001
     for _ in range(100):
         self.engine.clean_all_buckets()
         query = scipy.sparse.rand(1000, 1, density=0.05)
         payload = 'data'
         self.engine.store_vector(query, payload)
         results = self.engine.neighbours(query)
         found_vector, found_data, found_distance = results[0]
         # Stored vectors are normalized, so compare against unitvec(query)
         expected = unitvec(query)
         max_deviation = numpy.abs((expected - found_vector)).max()
         self.assertAlmostEqual(max_deviation, 0, delta=tolerance)
         self.assertEqual(found_data, payload)
         self.assertAlmostEqual(found_distance, 0.0, delta=tolerance)
Ejemplo n.º 4
0
    def __init__(self, N, vectors, coverage_ratio=0.2):
        """
        Performs exact nearest neighbour search on the data set.

        vectors can either be a numpy matrix with all the vectors
        as columns OR a python array containing the individual
        numpy vectors.

        N is the number of nearest neighbours recorded for each query
        vector. coverage_ratio is the fraction of the data set that is
        used as query vectors; the queries are spread evenly over the
        data set by index.

        After construction, self.closest maps each query index to the
        indices of its nearest neighbours and
        self.exact_search_time_per_vector holds the mean search time per
        query (0.0 when the query set is empty).
        """
        # We need a dict from vector string representation to index
        self.vector_dict = {}
        self.N = N
        self.coverage_ratio = coverage_ratio
        numpy_vectors = numpy_array_from_list_or_numpy_array(vectors)

        # Get numpy array representation of input: every column is
        # normalized and the results are stacked as rows
        self.vectors = numpy.vstack([unitvec(v) for v in numpy_vectors.T])

        # Build map from vector string representation to vector
        for index, v in enumerate(self.vectors):
            self.vector_dict[self.__vector_to_string(v)] = index

        # Determine the indices of query vectors used for comparance
        # with approximated search.
        vector_count = len(self.vectors)
        query_count = numpy.floor(self.coverage_ratio * vector_count)
        self.query_indices = []
        for k in range(int(query_count)):
            index = numpy.floor(k * (float(vector_count) / query_count))
            # Clamp so a coverage ratio above 1 cannot index past the end
            index = min(index, vector_count - 1)
            self.query_indices.append(int(index))

        print('\nStarting exact search (query set size=%d)...\n' % query_count)

        # For each query vector get the closest N neighbours
        self.closest = {}
        self.exact_search_time_per_vector = 0.0

        for index in self.query_indices:
            v = self.vectors[index, numpy.newaxis]
            exact_search_start_time = time.time()
            D = cdist(v, self.vectors, 'euclidean')
            # numpy.argsort replaces the deprecated scipy.argsort alias.
            # The first entry of the ordering is skipped - presumably the
            # query vector itself at distance zero.
            self.closest[index] = numpy.argsort(D)[0, 1:N+1]

            # Save time needed for exact search
            exact_search_time = time.time() - exact_search_start_time
            self.exact_search_time_per_vector += exact_search_time

        print('Done with exact search...\n')

        # Normalize search time; an empty query set (tiny data set or
        # small coverage ratio) would otherwise divide by zero
        if self.query_indices:
            self.exact_search_time_per_vector /= float(len(self.query_indices))
Ejemplo n.º 5
0
 def store_vector(self, v, data=None):
     """
     Hash vector v with every configured hash and store it, together with
     data, in each bucket the hashes report.

     The data argument must be JSON-serializable. It is kept alongside the
     vector and returned in search results.
     """
     # Hashing uses the raw vector; what gets stored is the normalized
     # form (used during retrieval)
     normalized = unitvec(v)
     for hasher in self.lshashes:
         for key in hasher.hash_vector(v):
             self.storage.store_vector(
                 hasher.hash_name, key, normalized, data)
Ejemplo n.º 6
0
 def store_vector(self, v, data=None):
     """
     Hash vector v with every configured hash and store it, together with
     data, in each bucket the hashes report.

     The data argument must be JSON-serializable. It is kept alongside the
     vector and returned in search results.
     """
     # The raw vector is hashed, the normalized vector is stored (used
     # during retrieval)
     unit_v = unitvec(v)
     for hasher in self.lshashes:
         for key in hasher.hash_vector(v):
             self.storage.store_vector(
                 hasher.hash_name, key, unit_v, data)
Ejemplo n.º 7
0
 def store_many_vectors(self, vs, data=None):
     """
     Store a batch of vectors with one storage call per hash.

     Every vector in vs is normalized for storage. For each configured
     hash, the first bucket key reported for each raw vector is collected
     and the whole batch is handed to the storage.

     The data argument must be either None or a list of JSON-serializable
     objects. It is stored with the vectors and returned in search
     results.
     """
     # Normalized vectors are what gets stored (used during retrieval)
     normalized = [unitvec(vec) for vec in vs]
     for hasher in self.lshashes:
         keys = []
         for vec in vs:
             # Only the first bucket key of each vector is used
             keys.append(hasher.hash_vector(vec)[0])
         self.storage.store_many_vectors(
             hasher.hash_name, keys, normalized, data)
Ejemplo n.º 8
0
 def store_many_vectors(self, vs, data=None):
     """
     Store a batch of vectors with one storage call per hash.

     Every vector in vs is normalized for storage. For each configured
     hash, the first bucket key reported for each raw vector is collected
     and the whole batch is handed to the storage.

     The data argument must be either None or a list of JSON-serializable
     objects. It is stored with the vectors and returned in search
     results.
     """
     # Normalized vectors are what gets stored (used during retrieval)
     unit_vectors = [unitvec(vec) for vec in vs]
     for hasher in self.lshashes:
         first_keys = []
         for vec in vs:
             # Only the first bucket key of each vector is used
             first_keys.append(hasher.hash_vector(vec)[0])
         self.storage.store_many_vectors(
             hasher.hash_name, first_keys, unit_vectors, data)
Ejemplo n.º 9
0
 def test_retrieval_sparse(self):
     """
     Store a random sparse vector, query with the same vector and check
     that the first result is its normalized form with the stored data
     and a distance of (almost) zero. Repeated 100 times on empty buckets.
     """
     tolerance = 0.000000001
     for _ in range(100):
         self.engine.clean_all_buckets()
         query = scipy.sparse.rand(1000, 1, density=0.05)
         payload = 'data'
         self.engine.store_vector(query, payload)
         hits = self.engine.neighbours(query)
         hit_vector, hit_data, hit_distance = hits[0]
         # Stored vectors are normalized, so compare against unitvec(query)
         expected = unitvec(query)
         largest_gap = numpy.abs((expected - hit_vector)).max()
         self.assertAlmostEqual(largest_gap, 0, delta=tolerance)
         self.assertEqual(hit_data, payload)
         self.assertAlmostEqual(hit_distance, 0.0, delta=tolerance)
Ejemplo n.º 10
0
    def neighbours(self, v):
        """
        Look up the neighbours of vector v.

        Hashes v, gathers the content of every matching bucket in storage,
        runs the (optional) fetch vector filters, applies the (optional)
        distance function and finally the (optional) vector filters.

        Returns a list of (vector, data, distance) tuples when a distance
        is configured, otherwise the candidates as they came out of
        storage and the filters.
        """
        # Gather the content of all matching buckets of all hashes
        found = []
        for hasher in self.lshashes:
            for key in hasher.hash_vector(v, querying=True):
                found.extend(self.storage.get_bucket(hasher.hash_name, key))

        # Fetch vector filters run before any distance is computed; each
        # filter consumes the output of the previous one
        for pre_filter in self.fetch_vector_filters or ():
            found = pre_filter.filter_vectors(found)

        if self.distance:
            # Stored vectors are normalized, so normalize the query too
            query = unitvec(v)
            found = [
                (item[0], item[1], self.distance.distance(item[0], query))
                for item in found
            ]

        # Vector filters see the tuples with the distance attached
        for post_filter in self.vector_filters or ():
            found = post_filter.filter_vectors(found)

        return found
Ejemplo n.º 11
0
    def neighbours(self, v):
        """
        Look up the neighbours of vector v.

        Hashes v, gathers the content of every matching bucket in storage,
        runs the (optional) fetch vector filters, applies the (optional)
        distance function and finally the (optional) vector filters.

        Returns a list of (vector, data, distance) tuples when a distance
        is configured, otherwise the candidates as they came out of
        storage and the filters.
        """
        # Gather the content of all matching buckets of all hashes
        matches = []
        for hasher in self.lshashes:
            for key in hasher.hash_vector(v, querying=True):
                bucket = self.storage.get_bucket(hasher.hash_name, key)
                matches.extend(bucket)

        # Fetch vector filters run before any distance is computed; each
        # filter consumes the output of the previous one
        for fetch_filter in self.fetch_vector_filters or ():
            matches = fetch_filter.filter_vectors(matches)

        if self.distance:
            # Stored vectors are normalized, so normalize the query too
            unit_query = unitvec(v)
            matches = [
                (m[0], m[1], self.distance.distance(m[0], unit_query))
                for m in matches
            ]

        # Vector filters see the tuples with the distance attached
        for result_filter in self.vector_filters or ():
            matches = result_filter.filter_vectors(matches)

        return matches
 def __vector_to_string(self, vector):
     """
     Return the string form of vector after normalizing it and rounding
     its components to three decimals.
     """
     rounded = numpy.round(unitvec(vector), decimals=3)
     return numpy.array_str(rounded)
Ejemplo n.º 13
0
 def __vector_to_string(self, vector):
     """
     Return the string form of vector after normalizing it and rounding
     its components to three decimals.
     """
     unit_rounded = numpy.round(unitvec(vector), decimals=3)
     return numpy.array_str(unit_rounded)