Example #1
def index_user_vectors():

    print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())

    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()

    print 'Indexing took %f seconds' % (t1 - t0)
Example #2
def index_user_vectors():

    #print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    #print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen,
                         lshashes=[permutations],
                         distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()

    #print 'Indexing took %f seconds' % (t1 - t0)
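A minimal query sketch (not part of the original example): once build_permuted_index() has run, the engine can be searched. It assumes the same globals as above (engine_perm, user_vector) and NearPy's convention that Engine.neighbours() returns (vector, data, distance) tuples.

def query_similar_users(u, topn=10):
    # Look up users whose vectors hash near user u's vector.
    results = engine_perm.neighbours(user_vector[u])
    # 'data' holds the user key passed to store_vector() above.
    return [(data, dist) for _, data, dist in results[:topn]]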
Example #3
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num,
                                         matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension,
                             lshashes=[permutations2],
                             distance=CosineDistance(),
                             vector_filters=[nearest])
Example #4
def LSH(Layers, K):

    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]

    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index every row vector (set its data to a unique metadata string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]

        meta_data = str(index)+',' + str(int(video_data[index, 0])) + ', ' + str(int(video_data[index, 1])) + ', ' + str(int(video_data[index, 2])) \
                    + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4])

        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)

    print 'stop'
Example #5
def knn(data,k):
    assert k<=len(data)-1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k=k+1
    n,dimension = data.shape
    ind = []
    dist = []
    

    if(dimension<10):
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp',10)
        
    engine = Engine(dimension, lshashes=[rbp], vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)
    
    
    for i in range(n):
     
        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])
        
  
    return N,dist,ind
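A hypothetical driver for the knn() helper above; the array shape and k are invented, and the NearPy imports used by the example are assumed to be in scope. Note that the returned N only holds the neighbours of the last query row, while ind and dist cover every row.

import numpy as np

data = np.random.randn(100, 8)
N, dist, ind = knn(data, 5)
# ind[i] / dist[i]: neighbour indices / distances for row i, self excluded
# (hence the [1:] slices above).
print(ind[0])
print(dist[0])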
Example #6
class StateDBEngine(object):
    def __init__(self):
        # initialize "nearby" library
        self.dim = 4
        self.rbp = RandomBinaryProjections('rbp', 100)
        self.engine = Engine(self.dim, lshashes=[self.rbp])
        # performance counter
        self.counter = 0

    def add(self, x, data):
        # print 'add data = ', data
        self.engine.store_vector(x, data)
        self.counter += 1

    def lookup(self, x, THRESHOLD=0.1):
        naver = self.engine.neighbours(x)
        if len(naver) == 0:
            return None

        pt, data, d = naver[0]
        # print 'lhs, rhs', x, pt,
        # print 'd = ', d, (d < THRESHOLD), (data is None)
        if d < THRESHOLD:
            return data
        else:
            return None
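An illustrative round trip through StateDBEngine (invented vectors; the 4-dimensional space comes from self.dim above). An identical query vector has distance 0, so it falls under the default THRESHOLD.

import numpy as np

db = StateDBEngine()
db.add(np.array([0.1, 0.2, 0.3, 0.4]), data='state-1')
# The same vector should come back with distance 0 < THRESHOLD.
print(db.lookup(np.array([0.1, 0.2, 0.3, 0.4])))   # -> 'state-1'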
Example #7
def index_in_text_engine(nid_gen,
                         tfidf,
                         lsh_projections,
                         tfidf_is_dense=False):
    num_features = tfidf.shape[1]
    print("TF-IDF shape: " + str(tfidf.shape))
    text_engine = Engine(num_features,
                         lshashes=[lsh_projections],
                         distance=CosineDistance())

    st = time.time()
    row_idx = 0
    for key in nid_gen:
        if tfidf_is_dense:
            dense_row = tfidf[row_idx]
            array = dense_row
        else:
            sparse_row = tfidf.getrow(row_idx)
            dense_row = sparse_row.todense()
            array = dense_row.A[0]
        row_idx += 1
        text_engine.store_vector(array, key)
    et = time.time()
    print("Total index text: " + str((et - st)))
    return text_engine
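A hedged usage sketch for index_in_text_engine() with a small dense matrix; the document ids and hash size are invented, and the NearPy imports are assumed to be in scope.

import numpy as np

tfidf = np.random.rand(3, 50)            # 3 documents, 50 features
doc_ids = iter(['doc-a', 'doc-b', 'doc-c'])
engine = index_in_text_engine(doc_ids, tfidf,
                              RandomBinaryProjections('rbp', 10),
                              tfidf_is_dense=True)
# Querying with an indexed row should return ('doc-a', distance ~0) among the hits.
print(engine.neighbours(tfidf[0]))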
Example #8
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
Example #9
    def test_random_discretized_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # First get recall and precision for one 1-dim random hash
        rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
            (recall1, precision1, searchtime1))

        # Then get recall and precision for one 4-dim random hash
        rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        result = exp.perform_experiment([engine])

        recall2 = result[0][0]
        precision2 = result[0][1]
        searchtime2 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
            (recall2, precision2, searchtime2))

        # Many things are random here, but the precision should increase
        # with dimension
        self.assertTrue(precision2 > precision1)
Example #10
class LSH:
    def __init__(self, path, dataSize):
        self.path = path
        self.dataSize = dataSize

    def preprocess(self):
        ids = []
        meta = []
        data = []

        for i in range(self.dataSize):
            with open(self.path + str(i) + ".data", "rb") as file:
                f_song_id = pickle.load(file)
                f_songMeta = pickle.load(file)
                f_data = pickle.load(file)
                ids.append(f_song_id)
                meta.append(f_songMeta)
                data.append(f_data)

        self.id = np.array(ids)
        self.meta = np.array(meta)
        self.data = np.array(data)

    def generate_hashtable(self):
        self.engine = Engine(self.data.shape[1],
                             lshashes=[RandomBinaryProjections('rbp', 20)])

        for i in range(self.dataSize):
            self.engine.store_vector(self.data[i], data=self.id[i])

    def query(self, data):
        return self.engine.neighbours(data)
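An invented driver for the LSH class above; it assumes pickle files named "0.data", "1.data", ... exist under the given path, each containing the three pickled objects read by preprocess().

lsh = LSH('data/songs/', dataSize=100)
lsh.preprocess()
lsh.generate_hashtable()
# Query with one of the indexed vectors; matches come back as
# (vector, song_id, distance) tuples.
print(lsh.query(lsh.data[0]))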
Example #11
	def loadHashmap(self, feature_size, result_n):
		# Create redis storage adapter
		redis_object = Redis(host='localhost', port=6379, db=0)
		redis_storage = RedisStorage(redis_object)
		pdb.set_trace()
		try:
			# Get hash config from redis
			config = redis_storage.load_hash_configuration('test')
			# Config is existing, create hash with None parameters
			lshash = RandomBinaryProjections(None, None)
			# Apply configuration loaded from redis
			lshash.apply_config(config)
			
		except:
			# Config doesn't exist yet; create hash from scratch, with 10 projections
			lshash = RandomBinaryProjections('test', 10)
			

		# Create engine for feature space of 100 dimensions and use our hash.
		# This will set the dimension of the lshash only the first time, not when
		# using the configuration loaded from redis. Use redis storage to store
		# buckets.
		nearest = NearestFilter(1000)
		#self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
		pdb.set_trace()
		self.engine = Engine(192, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())

		# Do some stuff like indexing or querying with the engine...

		# Finally store hash configuration in redis for later use
		redis_storage.store_hash_configuration(lshash)
Example #12
def knn(data, k):
    assert k <= len(
        data
    ) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if (dimension < 10):
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension,
                    lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):

        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
Example #13
        def RunAnnNearpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            train, label = SplitTrainData(self.dataset)

            with totalTimer:
                # Get all the parameters.
                try:
                    # Perform Approximate Nearest-Neighbors
                    dimension = train.shape[1]
                    rbp = RandomBinaryProjections('rbp', 10)
                    engine = Engine(dimension, lshashes=[rbp])
                    for i in range(len(train)):
                        engine.store_vector(train[i], 'data_%d' % i)
                    for i in range(len(queryData)):
                        v = engine.neighbours(queryData[i])
                except Exception as e:
                    Log.Info(e)
                    q.put(e)
                    return -1
            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #14
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    #perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
Example #15
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #16
class PointCalculator():
    def __init__(self, point_list, point):
        self.__configure_calculator(point_list, point)

    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2

        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)

    def __load_point_list_in_engine(self):
        for index in xrange(0, len(self.__point_list__)):
            v = numpy.array(self.__point_list__[index])
            self.__engine__.store_vector(v, 'data_%d' % index)

    def set_searching_point_list(self, point_list):
        self.__point_list__ = point_list
        self.__load_point_list_in_engine()

    def set_query_point(self, point):
        self.__point__ = point

    def __get_nearest_point(self):
        return self.__engine__.neighbours(numpy.array(self.__point__))

    def get_nearest_point_array_coords(self):
        nearest_point = self.__get_nearest_point()
        return [nearest_point[0][0][0], nearest_point[0][0][1]]
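A hypothetical run of PointCalculator on a few 2-D points. LSH is approximate, so the true nearest point is likely, not guaranteed, to be returned, and the stored coordinates are subject to the NearPy version's storage behaviour.

points = [[0, 0], [3, 4], [10, 10]]
calc = PointCalculator(points, [2.9, 4.1])
# Most likely the coordinates of [3, 4], the closest stored point.
print(calc.get_nearest_point_array_coords())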
Example #17
    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)
Example #18
class RandomBinaryNN(NearestNeighbor):
    """ Nearest neighbor implementation by using random binary trees from nearpy package """
    def __init__(self, dimension: int, number_projections: int,
                 threshold: float):
        """
        :param dimension:
            Number of dimensions of input points
        :param number_projections:
            Number of random projections used for finding nearest neighbors.
            Trade-off: More projections result in a smaller number of false positives in candidate set
        :param threshold:
            Distance threshold for definition nearest: all points within this specific distance
        """
        self.rbp = RandomBinaryProjections('rbp', number_projections)
        self.sqdist = SquaredEuclideanDistance()
        self.ann_engine = Engine(
            dimension,
            lshashes=[self.rbp],
            distance=self.sqdist,
            vector_filters=[DistanceThresholdFilter(threshold)])

    def insert_candidate(self, point: np.ndarray, metadata):
        self.ann_engine.store_vector(point, data=metadata)

    def get_candidates(self, point: np.ndarray):
        return [
            NearestNeighborResult(res[0], res[1], res[2])
            for res in self.ann_engine.neighbours(point)
        ]
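An illustrative use of RandomBinaryNN; the dimension, projection count, and threshold are arbitrary, and NearestNeighborResult comes from the example's own imports.

import numpy as np

nn = RandomBinaryNN(dimension=3, number_projections=8, threshold=0.5)
nn.insert_candidate(np.array([0.0, 0.1, 0.0]), metadata='a')
nn.insert_candidate(np.array([0.9, 0.9, 0.9]), metadata='b')
# Only points within squared-Euclidean distance 0.5 of the query pass the filter.
print(nn.get_candidates(np.array([0.0, 0.0, 0.0])))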
Example #19
 def __init__(self, num_features, projection_count=30):
     self.num_features = num_features
     #self.rbp = RandomDiscretizedProjections('default', projection_count, bin_width=100)
     self.rbp = RandomBinaryProjections('default', projection_count)
     #self.rbp = RandomBinaryProjectionTree('default', projection_count, 1)
     self.text_engine = Engine(num_features,
                               lshashes=[self.rbp],
                               distance=CosineDistance())
Example #20
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the ten closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, data.nbae))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in
                    np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #21
    def __init__(self, x):
        self.n, self.f = x.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', 10)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = x[i, :]
            self.engine.store_vector(v, i)
Example #22
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows, num_cols = decade_matrix.get_shape()
    print("the number of rows: " + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    # The engine dimension must match the stored row vectors (the column count,
    # not the row count)
    engine = Engine(num_cols, lshashes=[rbp])
    for i in range(num_rows):
        print(i)

        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
Example #23
    def fit(self, X):
        b = self.params['b']
        self.n, self.f = X.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', b)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = np.squeeze(np.copy(X[i, :]))
            self.engine.store_vector(v, i)
Example #24
class CFiltering:
    def __init__(self,
                 matrix,
                 max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()],
                 distance=Pearson()):

        if not isinstance(lshashes, list):

            raise TypeError("'lshashes' must be an instance of 'list'")

        if not isinstance(vector_filters, list):

            raise TypeError("'vector_filters' must be an instance of 'list'")

        self.underlying = Engine(len(matrix[0]),
                                 lshashes=lshashes,
                                 vector_filters=vector_filters +
                                 [NearestFilter(max_neighbours)],
                                 distance=distance)

        for vector in matrix:

            self.underlying.store_vector(vector)

    def predict(self, vector, precision):

        neighbours = self.underlying.neighbours(vector)

        if not neighbours:

            raise ValueError("Failed to acquire any neighbours")

        average = [
            sum(neighbour) / len(neighbour) for neighbour, _, _ in neighbours
        ]

        avg = sum(vector) / len(vector)

        for i in range(len(vector)):

            if vector[i] < precision:

                weighted_sum = 0

                for j, neighbour in enumerate(neighbours):

                    neighbour, _, similarity = neighbour

                    # deviation of this neighbour's rating for item i from its own mean
                    weighted_sum += similarity * (neighbour[i] - average[j])

                vector[i] = avg + weighted_sum / len(vector)

        return vector
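A small, invented smoke test for CFiltering; it assumes the Pearson distance class imported by this example and treats ratings below `precision` as missing.

import numpy as np

ratings = [np.array([5.0, 3.0, 4.0, 1.0]),
           np.array([4.0, 2.0, 4.0, 1.0]),
           np.array([1.0, 1.0, 2.0, 5.0])]
cf = CFiltering(ratings, max_neighbours=2)
# Predict the zeroed-out third entry of a new user's vector.
print(cf.predict(np.array([5.0, 3.0, 0.0, 1.0]), precision=0.5))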
Example #25
class GenerateHashTable():
    def __init__(self,
                 measure="EuclideanDistance",
                 data_path='data/classed_data/'):
        self.res = ResnetSimilarity()
        self.pbar = ProgressBar()
        # Dimension of our vector space
        self.dimension = 2048
        self.data_path = data_path

        # Create a random binary hash with 10 bits
        self.rbp = RandomBinaryProjections('rbp', 10)

        self.measure = measure
        self.msote = MemoryStorage()
        if measure == "EuclideanDistance":
            self.engine = Engine(self.dimension,
                                 lshashes=[self.rbp],
                                 storage=self.msote,
                                 distance=EuclideanDistance())
        else:
            self.engine = Engine(self.dimension,
                                 lshashes=[self.rbp],
                                 storage=self.msote,
                                 distance=CosineDistance())

    def generate_table(self):
        if self.measure == "CosineDistance":
            save_path = "hashed_objects/hashed_object_Cosine.pkl"
        elif self.measure == "EuclideanDistance":
            save_path = "hashed_objects/hashed_object_euclidean.pkl"
        else:
            save_path = "hashed_objects/" + str(self.measure) + ".pkl"

        count = 0
        for subdir, dirs, files in os.walk(self.data_path):
            for file in files:
                if '.jpg' in file:
                    img_path = os.path.join(subdir, file)
                    img = Image.open(img_path).convert('RGB')

                    if img.size[0] >= 100:
                        img_emb = self.res.getMapping(img)
                        img_emb = img_emb.view(-1, 2048)
                        img_emb = img_emb.numpy()

                        self.engine.store_vector(img_emb[0], img_path)
                        if count % 1000 == 0:
                            print("Saving  Image Embedding ", count)
                        count += 1

        print("Saving File To", save_path)
        with open(save_path, 'wb') as filehandler:
            pickle.dump(self.engine, filehandler)
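A hedged sketch of reading the pickled engine back and querying it; the path matches generate_table() above, and the random query merely stands in for a real 2048-d Resnet embedding.

import pickle
import numpy as np

with open('hashed_objects/hashed_object_euclidean.pkl', 'rb') as f:
    engine = pickle.load(f)

query_emb = np.random.randn(2048)          # stand-in embedding
# data holds the image path stored above.
for vec, img_path, dist in engine.neighbours(query_emb):
    print(img_path, dist)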
Example #26
 def __init__(self, emb_path, feature='title'):
     self.emb_path = emb_path
     self.feature = feature
     self.data_df = None
     self.tfidf = Vectorizer(**get_tfidf_params())
     self.fasttext_embedder = None
     self.fasttext_tfidf = None
     self.dimension = 300
     rbp = RandomBinaryProjections('rbp', 2)
     self.engine = Engine(self.dimension, lshashes=[rbp])
     pass
Example #27
    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2

        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)
Example #28
 def get_engine(self, vocab, vecs):
     logging.info('{} hash functions'.format(self.args.projections))
     hashes = [PCABinaryProjections('ne1v', self.args.projections, vecs[:1000,:].T)]
     engine = Engine(vecs.shape[1], lshashes=hashes, distance=[],
                     vector_filters=[])
     for ind, vec in enumerate(vecs):
         if not ind % 100000:                
             logging.info( 
                 '{} words added to nearpy engine'.format(ind))
         engine.store_vector(vec, ind)
     return engine 
Example #29
    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets) == 0)
Example #30
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        rbp = RandomBinaryProjections('rbp', 32)
        engine = Engine(f, lshashes=[rbp])

        for i in range(n):
            engine.store_vector(X[i], 'data_%d' % i)

        return engine
Example #33
 def __init__(self, data_points, sim_threshold=0.5, num_vectors=3):
     self.data_points = data_points
     self.point_num = self.data_points.shape[0]
     self.dimension = self.data_points.shape[1] - 1
     # Create a random binary hash with . bits
     self.rbp = RandomBinaryProjections('rbp', num_vectors, rand_seed=42)
     self.engine = Engine(
         self.dimension,
         lshashes=[self.rbp],
         vector_filters=[DistanceThresholdFilter(1 - sim_threshold)])
     for i in range(self.point_num):
         self.engine.store_vector(self.data_points[i, 1:], '%d' % i)
Example #34
class Neighbors:
    """ Nearest neighbors. """
    def __init__(self, config, verbose=True, log_file=None):
        # set up logger
        self._logger = Logger.get_logger(self.__class__.__name__,
                                         log_file=log_file,
                                         silence=(not verbose),
                                         global_log_file=verbose)

        # read config
        self._parse_config(config)

        self._engine = None

    def _parse_config(self, config):
        self._num_neighbors = config["num_neighbors"]

    def _build_engine(self, dimension):
        # build NearPy engine
        self._logger.info("Building engine...")
        self._engine = Engine(
            dimension, vector_filters=[NearestFilter(self._num_neighbors)])

    def store(self, vectors, data=None, log_freq=10, verbose=True):
        self._logger.info("Storing vectors...")
        if data is not None:
            assert vectors.shape[0] == len(
                data), "Dim 0 of vectors and data must match!"

        if self._engine is None:
            self._build_engine(vectors.shape[-1])

        num_vectors = vectors.shape[0]
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Storing vector {} of {}...".format(
                    idx, num_vectors))
            if data is not None:
                self._engine.store_vector(vectors[idx], data[idx])
            else:
                self._engine.store_vector(vectors[idx])

    def predict(self, vectors, log_freq=10, verbose=True):
        self._logger.info("Predicting...")

        num_vectors = vectors.shape[0]
        neighbors = []
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Predicting vector {} of {}...".format(
                    idx, num_vectors))
            neighbors.append(self._engine.neighbours(vectors[idx]))
        return neighbors
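An invented smoke test for Neighbors; the config only needs the "num_neighbors" key read by _parse_config(), and the Logger helper comes from the example's own imports (Python 2, given the xrange calls).

import numpy as np

nbrs = Neighbors({"num_neighbors": 5}, verbose=False)
nbrs.store(np.random.randn(20, 8), data=[str(i) for i in range(20)], verbose=False)
matches = nbrs.predict(np.random.randn(1, 8), verbose=False)
print(len(matches))    # one query vector -> one list of neighbours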
Example #35
    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_,
                              lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)
Example #36
 def fit(self, X, y=None, hash="randbinary"):
     X = np.array(X)
     assert len(X.shape) == 2, "X not 2-rank"
     dimension = X.shape[-1]
     if hash == "randbinary":
         rbp = RandomBinaryProjections('rbp', 10)
     elif hash == "pcabinary":
         rbp = PCABinaryProjections('rbp', 10, training_set=X)
     self.engine = Engine(dimension, lshashes=[rbp])
     index = 0
     for x in X:
         self.engine.store_vector(x, str(index))
         index += 1
Example #37
class LSHIndex(Index):

    def __init__(self, hasher, number_of_tables=6, length_of_tables=12, match_thresh=0.2, association_thresh=0.1, storage=memoryStorage):
        """
        :param hasher:
        @type hasher: Hasher
        """
        Index.__init__(self, hasher,
                       number_of_tables=number_of_tables,
                       length_of_tables=length_of_tables,
                       match_thresh=match_thresh,
                       association_thresh=association_thresh)
        self.hasher = hasher
        self.match_thresh = match_thresh
        self.association_thresh = association_thresh
        self.tables = [None]*number_of_tables
        for i in range(number_of_tables):
            self.tables[i] = RandomBinaryProjections(str(i), length_of_tables)
        self.engine = Engine(self.hasher.dims(),
                             lshashes=self.tables,
                             storage=storage(),
                             fetch_vector_filters=[NoVectorFilter()])

    def index(self, id, img):
        item = self.hasher.hash(id, img)
        for i in range(len(item.descriptors)):
            self.engine.store_vector(item.descriptors[i],data=(id, item.keypoints[i], item.descriptors[i]))
        return item

    def find(self, id, img, index_if_not_found=False):
        item = self.hasher.hash(id, img)
        matches = {}
        #count_min =self.association_thresh * float(len(item.descriptors))
        for x in item.descriptors:
            for neighbour in self.engine.neighbours(x):
                if neighbour[1][0] in matches:
                    continue
                y = neighbour[1][2]
                dist = l2norm(x, y)
                key = neighbour[1][0]
                if dist < self.match_thresh:
                    #if dist > 0.0001:
                    #    print('{} {} {}'.format(id, neighbour[1][0], dist))
                    matches[key] = (matches[key] + 1) if key in matches else 1
        if id not in matches and index_if_not_found:
            for i in range(len(item.descriptors)):
                self.engine.store_vector(item.descriptors[i], data=(id, item.keypoints[i], item.descriptors[i]))
        #for id, count in matches.items():
        #    #if count >= count_min:
        #    yield id
        return list(matches.keys())
Example #38
    def _build_rdp_engine(self, matrix, rdp, normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
            
        return engine
Example #39
    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
Example #40
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2], distance=CosineDistance(), vector_filters=[nearest])
Example #41
 def __init__(self):
     # initialize "nearby" library
     self.dim = 4
     self.rbp = RandomBinaryProjections('rbp', 100)
     self.engine = Engine(self.dim, lshashes=[self.rbp])
     # performance counter
     self.counter = 0
Example #42
	def load_DL(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.DL_[-1].transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])		
Example #43
	def load_KPCA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.KPCA_.alphas_.shape[1], lshashes=[rbp])
		transformed_vectors = self.KPCA_.transform(vector_set.T)
		for i in range(len(list(self.training_))):
			#vector=vector_set[:,i]
			#vector=np.reshape(vector,(self.biggest,1))
			#vector=self.KPCA_.transform(vector)
			self.engine_.store_vector(transformed_vectors[i,:], self.training_[i])
Example #44
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
Example #45
def test_sparse():
    dim = 500
    num_train = 1000
    num_test = 1
    train_data = ss.rand(dim, num_train)  # pickle.load('/home/jmahler/Downloads/feature_objects.p')
    test_data = ss.rand(dim, num_test)

    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dim, lshashes=[rbp])

    for i in range(num_train):
        engine.store_vector(train_data.getcol(i))

    for j in range(num_test):
        N = engine.neighbours(test_data.getcol(j))
        print N

    IPython.embed()
Example #46
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance  = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
Example #47
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #48
	def load_PCA(self,vector_set):
		"""reinitializes our engine and loads a numpy set of vectors of dimension (self.biggest,1) 
		into self.engine_"""
		rbp = RandomBinaryProjections('rbp', 10)
		self.engine_ = Engine(self.PCA_.components_.shape[1], lshashes=[rbp])
		transformed_vectors = self.PCA_.transform(vector_set.T)
		for i in range(len(list(self.training_))):
			#vector=vector_set[:,i]                        
			#vector=np.reshape(vector,(self.biggest,1))
			#vector=self.PCA_.transform(vector)
			self.engine_.store_vector(transformed_vectors[i,:], self.training_[i])
Example #49
    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)
Example #50
class lshsearcher:
    def __init__(self):
        self.__dimension = None
        self.__engine_perm = None
        self.__permutations = None

    def _set_confval(self, dimension=None):
        if dimension is None:
            return None
        else:
            self.__dimension = dimension

    def _engine_on(self):
        # Create permutations meta-hash
        self.__permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp_perm = RandomBinaryProjections('rbp_perm', 14)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.__permutations.add_child_hash(rbp_perm, rbp_conf)

        # Create engine
        self.__engine_perm = Engine(self.__dimension, lshashes=[self.__permutations], distance=CosineDistance())

    def conf(self, dimension):
        self._set_confval(dimension)
        self._engine_on()

    def getData(self, v):
        if self.__engine_perm is not None:
            self.__engine_perm.store_vector(v)

    def commitData(self):
        if self.__permutations is not None:
            self.__permutations.build_permuted_index()

    def find(self, v):
        if self.__engine_perm is not None:
            return self.__engine_perm.neighbours(v)
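An end-to-end sketch of the lshsearcher class above, with an arbitrary 100-dimensional space and random vectors:

import numpy as np

searcher = lshsearcher()
searcher.conf(100)
for _ in range(1000):
    searcher.getData(np.random.randn(100))
searcher.commitData()    # build the permuted index before querying
print(searcher.find(np.random.randn(100)))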
Example #51
    def _engine_on(self):
        # Create permutations meta-hash
        self.__permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp_perm = RandomBinaryProjections('rbp_perm', 14)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.__permutations.add_child_hash(rbp_perm, rbp_conf)

        # Create engine
        self.__engine_perm = Engine(self.__dimension, lshashes=[self.__permutations], distance=CosineDistance())
Example #52
class NearPy(NearestNeighbor):
    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)

        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]

        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()
#        logging.info('Took %f sec' %(train_end - train_start))

    def within_distance(x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)

        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)

        if len(query_result) == 0:
            return [], []

        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
Example #53
class TestPermutation(unittest.TestCase):

    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000,200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print '\nNeighbour distances with permuted index:'
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Do random query on engine without permutations meta-hash
        print '\nNeighbour distances without permuted index (distances should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Real neighbours
        print '\nReal neighbour distances:'
        query = query.reshape((1,200))
        dists = CosineDistance().distance_matrix(matrix,query)
        dists = dists.reshape((-1,))
        dists = sorted(dists)
        print dists[:10]
Example #54
    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data {}'.format(k)
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            self.assertEqual(len(n), 20)
Example #55
def example2():

    # Dimension of feature space
    DIM = 100

    # Number of data points (dont do too much because of exact search)
    POINTS = 20000

    ##########################################################

    print 'Performing indexing with HashPermutations...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with HashPermutationMapper...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutationMapper:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with multiple binary hashes...'
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print '\nNeighbour distances with multiple binary hashes:'
    print '  -> Candidate count is %d' % engine_rbps.candidate_count(query)
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
Example #56
 def test_delete_vector_with_provided_value(self):
     engine = Engine(self.dim, lshashes=[UniBucket('testHash')])
     self.fill_engine(engine)
     engine.delete_vector(self.removed_value, self.removed_vector)
     self.check_delete(engine)
Example #57
 def setUp(self):
     self.engine = Engine(1000)
Example #58
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2], distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1 

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)
 
    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = map(float, self.face_feature[person].split(','))
            print '\nNeighbour distances with multiple binary hashes:'
            print '  -> Candidate count is %d' % self.engine.candidate_count(query)
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]
        res = zip(dists, scores, t_num)
        res.sort(key = lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [ x for x in zip_seq if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
Example #59
class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost',
                                  port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)


        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])