Example #1
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
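The test above exercises the default Engine round trip (store a vector, then query it back). Below is a minimal, self-contained sketch of that pattern outside unittest; the imports are inferred from the calls in the snippet, and the exact vector/distance returned depends on the NearPy version (newer releases normalize stored vectors).

# Minimal round trip with NearPy's default Engine; a sketch, not part of the original tests.
import numpy
from nearpy import Engine

engine = Engine(1000)                       # default hashes/filters for a 1000-dim space
x = numpy.random.randn(1000)
engine.store_vector(x, 'data')              # attach arbitrary metadata to the vector
vec, data, dist = engine.neighbours(x)[0]   # each result is a (vector, data, distance) tuple
print(data, dist)                           # -> 'data' and a distance of (approximately) 0.0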
Example #2
def load_search_engine():
    global engine

    # read in the data file
    data = pandas.read_csv(os.path.join('data', 'features.tsv'), sep='\t')
    data_objects = pandas.read_csv(os.path.join('data', 'object_features.tsv'),
                                   sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp],
                    distance=EuclideanDistance())

    # indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i].replace('images\\\\',
                                        '').replace('images\\',
                                                    '').replace('images/', ''))

    for i in range(0, len(data_objects)):
        engine.store_vector(
            np.asarray(
                data_objects['features'][i].split(',')).astype('float64'),
            data_objects['filename'][i].replace('images\\\\', '').replace(
                'images\\', '').replace('images/', ''))

    return engine
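A hedged sketch of querying the engine returned by load_search_engine(); it assumes the same pandas/numpy/os imports and the same comma-separated 'features' column layout used during indexing above.

# Hypothetical query against the engine built above (re-reads one indexed row as the query).
engine = load_search_engine()
row = pandas.read_csv(os.path.join('data', 'features.tsv'), sep='\t').iloc[0]
query = np.asarray(row['features'].split(',')).astype('float64')
for vec, filename, distance in engine.neighbours(query):
    print(filename, distance)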
Example #3
def knn(data, k):
    assert k <= len(
        data
    ) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if (dimension < 10):
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension,
                    lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):

        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
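A quick usage sketch for knn(); the random data is only illustrative and assumes numpy as np plus the NearPy imports the function relies on.

# Approximate k-NN over 200 random 8-dimensional points (illustrative data).
data = np.random.randn(200, 8)
_, dist, ind = knn(data, k=5)
print(ind[0])    # indices of the approximate neighbours of point 0
print(dist[0])   # the corresponding distances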
Example #4
def index_user_vectors():

    print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()

    print 'Indexing took %f seconds' % (t1 - t0)
Example #5
class LSH:
    def __init__(self, path, dataSize):
        self.path = path
        self.dataSize = dataSize

    def preprocess(self):
        ids = []
        meta = []
        data = []

        for i in range(self.dataSize):
            with open(self.path + str(i) + ".data", "rb") as file:
                f_song_id = pickle.load(file)
                f_songMeta = pickle.load(file)
                f_data = pickle.load(file)
                ids.append(f_song_id)
                meta.append(f_songMeta)
                data.append(f_data)

        self.id = np.array(ids)
        self.meta = np.array(meta)
        self.data = np.array(data)

    def generate_hashtable(self):
        self.engine = Engine(self.data.shape[1],
                             lshashes=[RandomBinaryProjections('rbp', 20)])

        for i in range(self.dataSize):
            self.engine.store_vector(self.data[i], data=self.id[i])

    def query(self, data):
        return self.engine.neighbours(data)
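A sketch of driving the LSH class above; the path and file layout are assumptions (one pickled id/metadata/vector triple per .data file), and numpy/pickle/nearpy imports are taken from the snippet.

# Hypothetical usage: load pickled song vectors, build the hash table, query the first song.
lsh = LSH('songs/', dataSize=100)
lsh.preprocess()                  # read ids, metadata and feature vectors from the .data files
lsh.generate_hashtable()          # index all vectors in a NearPy Engine
hits = lsh.query(lsh.data[0])     # list of (vector, song_id, distance) tuples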
Example #6
def index_user_vectors():

    #print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    #print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen,
                         lshashes=[permutations],
                         distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
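index_user_vectors() reads module-level globals; the driver below is a hedged sketch that fakes those globals (user_vector, k_dimen, d_dimen) with random data just to show how the permuted index would be built and queried. It assumes numpy, time and the NearPy imports used by the function.

# Hypothetical globals for index_user_vectors(); real code would load user vectors from data.
k_dimen, d_dimen = 200, 10                              # vector dimension, projection bits
user_vector = {u: numpy.random.randn(k_dimen) for u in range(1000)}
index_user_vectors()
hits = engine_perm.neighbours(user_vector[0])           # (vector, user_id, distance) tuples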
Example #7
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #8
        def RunAnnNearpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            train, label = SplitTrainData(self.dataset)

            with totalTimer:
                # Get all the parameters.
                try:
                    # Perform Approximate Nearest-Neighbors
                    dimension = train.shape[1]
                    rbp = RandomBinaryProjections('rbp', 10)
                    engine = Engine(dimension, lshashes=[rbp])
                    for i in range(len(train)):
                        engine.store_vector(train[i], 'data_%d' % i)
                    for i in range(len(queryData)):
                        v = engine.neighbours(queryData[i])
                except Exception as e:
                    Log.Info(e)
                    q.put(e)
                    return -1
            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #9
class StateDBEngine(object):
    def __init__(self):
        # initialize "nearpy" library
        self.dim = 4
        self.rbp = RandomBinaryProjections('rbp', 100)
        self.engine = Engine(self.dim, lshashes=[self.rbp])
        # performance counter
        self.counter = 0

    def add(self, x, data):
        # print 'add data = ', data
        self.engine.store_vector(x, data)
        self.counter += 1

    def lookup(self, x, THRESHOLD=0.1):
        naver = self.engine.neighbours(x)
        if len(naver) == 0:
            return None

        pt, data, d = naver[0]
        # print 'lhs, rhs', x, pt,
        # print 'd = ', d, (d < THRESHOLD), (data is None)
        if d < THRESHOLD:
            return data
        else:
            return None
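A short usage sketch for StateDBEngine; the 4-dimensional state vectors and the numpy-as-np import are assumptions for illustration.

# Hypothetical usage: store one state and look it up again.
db = StateDBEngine()
state = np.array([0.1, 0.2, 0.3, 0.4])
db.add(state, data='state-0')
print(db.lookup(state))            # -> 'state-0' (distance below THRESHOLD)
print(db.lookup(state + 10.0))     # -> None when nothing is within THRESHOLD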
Example #10
def index_in_text_engine(nid_gen,
                         tfidf,
                         lsh_projections,
                         tfidf_is_dense=False):
    num_features = tfidf.shape[1]
    print("TF-IDF shape: " + str(tfidf.shape))
    text_engine = Engine(num_features,
                         lshashes=[lsh_projections],
                         distance=CosineDistance())

    st = time.time()
    row_idx = 0
    for key in nid_gen:
        if tfidf_is_dense:
            dense_row = tfidf[row_idx]
            array = dense_row
        else:
            sparse_row = tfidf.getrow(row_idx)
            dense_row = sparse_row.todense()
            array = dense_row.A[0]
        row_idx += 1
        text_engine.store_vector(array, key)
    et = time.time()
    print("Total index text: " + str((et - st)))
    return text_engine
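A hedged example call for index_in_text_engine(); the scikit-learn TF-IDF matrix and the projection hash below are stand-ins, not part of the original code.

# Hypothetical call with a tiny sparse TF-IDF matrix (requires scikit-learn).
from sklearn.feature_extraction.text import TfidfVectorizer
from nearpy.hashes import RandomBinaryProjections

docs = ["locality sensitive hashing", "nearest neighbour search", "random projections"]
tfidf = TfidfVectorizer().fit_transform(docs)        # sparse matrix, shape (n_docs, n_features)
rbp = RandomBinaryProjections('rbp', 10)
engine = index_in_text_engine(range(len(docs)), tfidf, rbp)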
Example #11
class PointCalculator():
    def __init__(self, point_list, point):
        self.__configure_calculator(point_list, point)

    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2

        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)

    def __load_point_list_in_engine(self):
        for index in xrange(0, len(self.__point_list__)):
            v = numpy.array(self.__point_list__[index])
            self.__engine__.store_vector(v, 'data_%d' % index)

    def set_searching_point_list(self, point_list):
        self.__point_list__ = point_list
        self.__load_point_list_in_engine()

    def set_query_point(self, point):
        self.__point__ = point

    def __get_nearest_point(self):
        return self.__engine__.neighbours(numpy.array(self.__point__))

    def get_nearest_point_array_coords(self):
        nearest_point = self.__get_nearest_point()
        return [nearest_point[0][0][0], nearest_point[0][0][1]]
Example #12
class RandomBinaryNN(NearestNeighbor):
    """ Nearest neighbor implementation by using random binary trees from nearpy package """
    def __init__(self, dimension: int, number_projections: int,
                 threshold: float):
        """
        :param dimension:
            Number of dimensions of input points
        :param number_projections:
            Number of random projections used for finding nearest neighbors.
            Trade-off: More projections result in a smaller number of false positives in candidate set
        :param threshold:
            Distance threshold for definition nearest: all points within this specific distance
        """
        self.rbp = RandomBinaryProjections('rbp', number_projections)
        self.sqdist = SquaredEuclideanDistance()
        self.ann_engine = Engine(
            dimension,
            lshashes=[self.rbp],
            distance=self.sqdist,
            vector_filters=[DistanceThresholdFilter(threshold)])

    def insert_candidate(self, point: np.ndarray, metadata):
        self.ann_engine.store_vector(point, data=metadata)

    def get_candidates(self, point: np.ndarray):
        return [
            NearestNeighborResult(res[0], res[1], res[2])
            for res in self.ann_engine.neighbours(point)
        ]
Example #13
def LSH(Layers, K):

    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]

    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index 1000000 random vectors (set their data to a unique string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]

        meta_data = str(index)+',' + str(int(video_data[index, 0])) + ', ' + str(int(video_data[index, 1])) + ', ' + str(int(video_data[index, 2])) \
                    + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4])

        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)

    print 'stop'
Example #14
def knn(data, k):
    assert k <= len(data) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension, lshashes=[rbp], vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):
        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
Example #15
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    #perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
Example #16
class NearPy(NearestNeighbor):
    def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
        NearestNeighbor.__init__(self, dist, phi)

    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_, lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)

    def train(self, data, k=10):
        self.data_ = np.array(data)
        self.featurized_ = self.featurize(data)

        shape = self.featurized_[0].shape
        assert len(shape) <= 2, 'Feature shape must be (1, N), (N, 1), or (N,)'
        if len(shape) == 1:
            self.transpose_ = False
            self.dimension_ = shape[0]
        else:
            assert 1 in shape, 'Feature shape must be (1, N) or (N, 1)'
            self.transpose_ = (shape[0] == 1)
            self.dimension_ = shape[1] if self.transpose_ else shape[0]

        logging.info('Constructing nearest neighbor data structure.')
        train_start = time.clock()
        self._create_engine(k)
        train_end = time.clock()
#        logging.info('Took %f sec' %(train_end - train_start))

    def within_distance(self, x, dist=0.5, return_indices=False):
        raise NotImplementedError

    def nearest_neighbors(self, x, k, return_indices=False):
        # HACK: load all data back into new engine if k doesn't match
        if k != self.k_:
            self._create_engine(k)

        feature = self.phi_(x)
        if self.transpose_:
            query_result = self.engine_.neighbours(feature.T)
        else:
            query_result = self.engine_.neighbours(feature)

        if len(query_result) == 0:
            return [], []

        features, indices, distances = zip(*query_result)
        if return_indices:
            return list(indices), list(distances)
        else:
            indices = np.array(indices)
            return list(self.data_[indices]), list(distances)
Example #17
class TestPermutation(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print '\nNeighbour distances with permuted index:'
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Do random query on engine without permutations meta-hash
        print '\nNeighbour distances without permuted index (distances should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Real neighbours
        print '\nReal neighbour distances:'
        query = query.reshape((1, 200))
        dists = CosineDistance().distance_matrix(matrix, query)
        dists = dists.reshape((-1, ))
        dists = sorted(dists)
        print dists[:10]
Example #18
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the ten closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, data.nbae))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in
                    np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #19
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows = decade_matrix.get_shape()[0]
    print("the number of rows:" + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    engine = Engine(num_rows, lshashes=[rbp])
    for i in range(num_rows):
        print(i)

        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
Example #20
class GenerateHashTable():
    def __init__(self,
                 measure="EuclideanDistance",
                 data_path='data/classed_data/'):
        self.res = ResnetSimilarity()
        self.pbar = ProgressBar()
        # Dimension of our vector space
        self.dimension = 2048
        self.data_path = data_path

        # Create a random binary hash with 10 bits
        self.rbp = RandomBinaryProjections('rbp', 10)

        self.measure = measure
        self.msote = MemoryStorage()
        if measure == "EuclideanDistance":
            self.engine = Engine(self.dimension,
                                 lshashes=[self.rbp],
                                 storage=self.msote,
                                 distance=EuclideanDistance())
        else:
            self.engine = Engine(self.dimension,
                                 lshashes=[self.rbp],
                                 storage=self.msote,
                                 distance=CosineDistance())

    def generate_table(self):
        if self.measure == "CosineDistance":
            save_path = "hashed_objects/hashed_object_Cosine.pkl"
        elif self.measure == "EuclideanDistance":
            save_path = "hashed_objects/hashed_object_euclidean.pkl"
        else:
            save_path = "hashed_objects/" + str(self.measure) + ".pkl"

        count = 0
        for subdir, dirs, files in os.walk(self.data_path):
            for file in files:
                if '.jpg' in file:
                    img_path = os.path.join(subdir, file)
                    img = Image.open(img_path).convert('RGB')

                    if img.size[0] >= 100:
                        img_emb = self.res.getMapping(img)
                        img_emb = img_emb.view(-1, 2048)
                        img_emb = img_emb.numpy()

                        self.engine.store_vector(img_emb[0], img_path)
                        if count % 1000 == 0:
                            print("Saving  Image Embedding ", count)
                        count += 1

        print("Saving File To", save_path)
        # TODO this is peculiar
        filehandler = open(save_path, 'wb')
        pickle.dump(self.engine, filehandler)
Example #21
class CFiltering:
    def __init__(self,
                 matrix,
                 max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()],
                 distance=Pearson()):

        if not isinstance(lshashes, list):

            raise TypeError("'lshashes' must be an instance of 'list'")

        if not isinstance(vector_filters, list):

            raise TypeError("'vector_filters' must be an instance of 'list'")

        self.underlying = Engine(len(matrix[0]),
                                 lshashes=lshashes,
                                 vector_filters=vector_filters +
                                 [NearestFilter(max_neighbours)],
                                 distance=distance)

        for vector in matrix:

            self.underlying.store_vector(vector)

    def predict(self, vector, precision):

        neighbours = self.underlying.neighbours(vector)

        if not neighbours:

            raise ValueError("Failed to acquire any neighbours")

        average = [
            sum(neighbour) / len(neighbour) for neighbour, _, _ in neighbours
        ]

        avg = sum(vector) / len(vector)

        for i in range(len(vector)):

            if vector[i] < precision:

                weighted_sum = 0

                for j, neighbour in enumerate(neighbours):

                    neighbour, _, similarity = neighbour

                    weighted_sum += similarity * (neighbour[j] - average[j])

                vector[i] = avg + weighted_sum / len(vector)

        return vector
Example #22
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        rbp = RandomBinaryProjections('rbp', 32)
        engine = Engine(f, lshashes=[rbp])

        for i in range(n):
            engine.store_vector(X[i], 'data_%d' % i)

        return engine
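A brief usage note: build_index() expects a dense 2-D array and tags each row with a 'data_<i>' string. The sketch below assumes numpy as np; since the method never uses self, None stands in for the instance.

# Hypothetical usage of build_index.
X = np.random.randn(1000, 64)
engine = build_index(None, X)
vec, tag, distance = engine.neighbours(X[0])[0]   # tag should be 'data_0' for an exact hit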
Example #23
    def get_engine(self, vocab, vecs):
        logging.info('{} hash functions'.format(self.args.projections))
        hashes = [PCABinaryProjections('ne1v', self.args.projections, vecs[:1000, :].T)]
        engine = Engine(vecs.shape[1], lshashes=hashes, distance=[],
                        vector_filters=[])
        for ind, vec in enumerate(vecs):
            if not ind % 100000:
                logging.info(
                    '{} words added to nearpy engine'.format(ind))
            engine.store_vector(vec, ind)
        return engine
Example #24
    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)
Example #25
class TestPermutation(unittest.TestCase):

    def setUp(self):
        logging.basicConfig(level=logging.WARNING)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000,200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        print '\nNeighbour distances with permuted index:'
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Do random query on engine without permutations meta-hash
        print '\nNeighbour distances without permuted index (distances should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]
        print dists

        # Real neighbours
        print '\nReal neighbour distances:'
        query = query.reshape((1,200))
        dists = CosineDistance().distance_matrix(matrix,query)
        dists = dists.reshape((-1,))
        dists = sorted(dists)
        print dists[:10]
Example #26
    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets) == 0)
Example #27
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        rbp = RandomBinaryProjections('rbp', 32)
        engine = Engine(f, lshashes=[rbp])

        for i in range(n):
            engine.store_vector(X[i], 'data_%d' % i)

        return engine
Example #28
class Neighbors:
    """ Nearest neighbors. """
    def __init__(self, config, verbose=True, log_file=None):
        # set up logger
        self._logger = Logger.get_logger(self.__class__.__name__,
                                         log_file=log_file,
                                         silence=(not verbose),
                                         global_log_file=verbose)

        # read config
        self._parse_config(config)

        self._engine = None

    def _parse_config(self, config):
        self._num_neighbors = config["num_neighbors"]

    def _build_engine(self, dimension):
        # build NearPy engine
        self._logger.info("Building engine...")
        self._engine = Engine(
            dimension, vector_filters=[NearestFilter(self._num_neighbors)])

    def store(self, vectors, data=None, log_freq=10, verbose=True):
        self._logger.info("Storing vectors...")
        if data is not None:
            assert vectors.shape[0] == len(
                data), "Dim 0 of vectors and data must match!"

        if self._engine is None:
            self._build_engine(vectors.shape[-1])

        num_vectors = vectors.shape[0]
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Storing vector {} of {}...".format(
                    idx, num_vectors))
            if data is not None:
                self._engine.store_vector(vectors[idx], data[idx])
            else:
                self._engine.store_vector(vectors[idx])

    def predict(self, vectors, log_freq=10, verbose=True):
        self._logger.info("Predicting...")

        num_vectors = vectors.shape[0]
        neighbors = []
        for idx in xrange(num_vectors):
            if verbose and idx % log_freq == 0:
                self._logger.info("Predicting vector {} of {}...".format(
                    idx, num_vectors))
            neighbors.append(self._engine.neighbours(vectors[idx]))
        return neighbors
Example #29
class TestPermutation(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())

    def test_runnable(self):

        # First index some random vectors
        matrix = numpy.zeros((1000, 200))
        for i in xrange(1000):
            v = numpy.random.randn(200)
            matrix[i] = v
            self.engine.store_vector(v)
            self.engine_perm.store_vector(v)

        # Then update permuted index
        self.permutations.build_permuted_index()

        # Do random query on engine with permutations meta-hash
        query = numpy.random.randn(200)
        results = self.engine_perm.neighbours(query)
        permuted_dists = [x[2] for x in results]

        # Do random query on engine without permutations meta-hash (distances
        # should be larger):'
        results = self.engine.neighbours(query)
        dists = [x[2] for x in results]

        self.assertLess(permuted_dists[0], dists[0])
Example #30
class LSHIndex(Index):

    def __init__(self, hasher, number_of_tables=6, length_of_tables=12, match_thresh=0.2, association_thresh=0.1, storage=memoryStorage):
        """
        :param hasher:
        @type hasher: Hasher
        """
        Index.__init__(self, hasher,
                       number_of_tables=number_of_tables,
                       length_of_tables=length_of_tables,
                       match_thresh=match_thresh,
                       association_thresh=association_thresh)
        self.hasher = hasher
        self.match_thresh = match_thresh
        self.association_thresh = association_thresh
        self.tables = [None]*number_of_tables
        for i in range(number_of_tables):
            self.tables[i] = RandomBinaryProjections(str(i), length_of_tables)
        self.engine = Engine(self.hasher.dims(),
                             lshashes=self.tables,
                             storage=storage(),
                             fetch_vector_filters=[NoVectorFilter()])

    def index(self, id, img):
        item = self.hasher.hash(id, img)
        for i in range(len(item.descriptors)):
            self.engine.store_vector(item.descriptors[i],data=(id, item.keypoints[i], item.descriptors[i]))
        return item

    def find(self, id, img, index_if_not_found=False):
        item = self.hasher.hash(id, img)
        matches = {}
        #count_min =self.association_thresh * float(len(item.descriptors))
        for x in item.descriptors:
            for neighbour in self.engine.neighbours(x):
                if neighbour[1][0] in matches:
                    continue
                y = neighbour[1][2]
                dist = l2norm(x, y)
                key = neighbour[1][0]
                if dist < self.match_thresh:
                    #if dist > 0.0001:
                    #    print('{} {} {}'.format(id, neighbour[1][0], dist))
                    matches[key] = (matches[key] + 1) if key in matches else 1
        if id not in matches and index_if_not_found:
            for i in range(len(item.descriptors)):
                self.engine.store_vector(item.descriptors[i], data=(id, item.keypoints[i], item.descriptors[i]))
        #for id, count in matches.items():
        #    #if count >= count_min:
        #    yield id
        return list(matches.keys())
Example #31
    def start(dataset, test_vector, num_nearest=5):

        # Create a random binary hash with 10 bits
        rbp = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        engine = Engine(dataset.shape[1], lshashes=[rbp])

        # Index the dataset vectors (set their data to a unique string)
        for i, v in enumerate(dataset):
            engine.store_vector(v, 'data_%d' % i)

        # Get nearest neighbours
        N = engine.neighbours(test_vector)
Example #32
    def _build_rdp_engine(self,matrix,rdp,normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create a random binary hash with 10 bits

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp],storage = MemoryStorage())
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
            
        return engine
Example #33
    def get_engine(self, vocab, vecs):
        logging.info('{} hash functions'.format(self.args.projections))
        hashes = [
            PCABinaryProjections('ne1v', self.args.projections,
                                 vecs[:1000, :].T)
        ]
        engine = Engine(vecs.shape[1],
                        lshashes=hashes,
                        distance=[],
                        vector_filters=[])
        for ind, vec in enumerate(vecs):
            if not ind % 100000:
                logging.info('{} words added to nearpy engine'.format(ind))
            engine.store_vector(vec, ind)
        return engine
Example #34
class lshNN(NNs):
    """
    Locality-sensitive hashing by random projection
        consider some options
    nearpy implementation
    """
    def __init__(self, b=16):
        self.params = {"method": "product quantization, numpy", 'b': b}

    def fit(self, X):
        b = self.params['b']
        self.n, self.f = X.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', b)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = np.squeeze(np.copy(X[i, :]))
            self.engine.store_vector(v, i)

    def _get_one_knn(self, v, k=3):
        v = np.squeeze(np.copy(v))
        vl = v.shape
        if vl[0] != self.f:
            # print(vl)
            raise Exception("Data Not Match")
        N = self.engine.neighbours(v)
        nni = -np.ones(k, dtype='int')
        nnd = np.empty(k)
        nnd[:] = np.nan
        for i in np.arange(k):
            try:
                nni[i] = N[i][1]
                nnd[i] = N[i][2]
            except IndexError:
                break
        return (nni, nnd)

    def get_knn(self, x, k=3):
        self.n, self.f = x.shape
        nni = -np.ones((self.n, k), dtype='int')
        nnd = np.empty((self.n, k))
        nnd[:] = np.nan
        for i in np.arange(self.n):
            i_i, i_d = self._get_one_knn(x[i, :], k)
            nni[i, :] = i_i
            nnd[i, :] = i_d
        return (nni, nnd)
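A short sketch of the lshNN workflow above (fit, then query); the random matrix and the numpy-as-np import are assumptions.

# Hypothetical usage: index 500 random 32-dim points, then query the first ten.
X = np.random.randn(500, 32)
nn = lshNN(b=16)
nn.fit(X)
idx, dist = nn.get_knn(X[:10], k=3)   # idx[i] holds up to 3 approximate neighbour indices, -1 if missing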
Example #35
class LSHRandomProjectionsIndex:

    def __init__(self, num_features, projection_count=30):
        self.num_features = num_features
        self.rbp = RandomBinaryProjections('default', projection_count)
        self.text_engine = Engine(num_features, lshashes=[self.rbp], distance=CosineDistance())

    def index(self, vector, key):
        if len(vector) != self.num_features:
            print("ERROR received vector.dim: " + str(len(vector)) + " on engine.dim: " + str(self.num_features))
            raise Exception
        self.text_engine.store_vector(vector, key)

    def query(self, vector):
        res = self.text_engine.neighbours(vector)
        return res
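A usage sketch for LSHRandomProjectionsIndex; the 300-dimensional random vectors and the numpy-as-np import are illustrative assumptions.

# Hypothetical usage: index a few vectors and query one back.
index = LSHRandomProjectionsIndex(num_features=300)
vectors = np.random.randn(10, 300)
for i, v in enumerate(vectors):
    index.index(v, key='doc_%d' % i)
print(index.query(vectors[0])[0][1])   # -> 'doc_0'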
Example #36
def build_lsh(data, hashbits=10):
    # Build a locality sensitive hashed database with (data), of bit-depth (hashbits)

    dimensions = data.shape[1]

    # Create a random binary hash with hashbits bits
    rbp = RandomBinaryProjections('rbp', hashbits)

    # Create engine with pipeline configuration
    engine = Engine(dimensions, lshashes=[rbp])

    # Index 1000000 random vectors (set their data to a unique string)
    for index in range(len(data)):
        engine.store_vector(data[index], '%d' % index)

    return engine
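A brief usage sketch for build_lsh(); it assumes numpy as np and a dense 2-D data array, as the function indexes one row per vector.

# Hypothetical usage: hash 1000 random 128-dim vectors and query one of them.
data = np.random.randn(1000, 128)
engine = build_lsh(data, hashbits=10)
neighbours = engine.neighbours(data[42])   # (vector, index-string, distance) tuples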
Example #37
    def build(self, train, batch_size=64, converter=convert_seq, device=0):
        train_iter = chainer.iterators.SerialIterator(train,
                                                      batch_size,
                                                      repeat=False)
        train_iter.reset()

        act_list = [[] for _ in range(self.n_dknn_layers)]
        label_list = []
        print('caching hiddens')
        n_batches = len(train) // batch_size
        for i, train_batch in enumerate(tqdm(train_iter, total=n_batches)):
            data = converter(train_batch, device=device, with_label=True)
            text = data['xs']
            labels = data['ys']

            with chainer.using_config('train', False):
                _, dknn_layers = self.model.predict(text, dknn=True)
                assert len(dknn_layers) == self.model.n_dknn_layers
            for i in range(self.n_dknn_layers):
                layer = dknn_layers[i]
                layer.to_cpu()
                act_list[i] += [x for x in layer.data]
            label_list.extend([int(x) for x in labels])
        self.act_list = act_list
        self.label_list = label_list

        if self.lsh:
            print('using Locally Sensitive Hashing for NN Search')
        else:
            print('using KDTree for NN Search')
        self.tree_list = []  # one lookup tree for each dknn layer
        for i in range(self.n_dknn_layers):
            print('building tree for layer {}'.format(i))
            if self.lsh:  # if lsh
                n_hidden = act_list[i][0].shape[0]
                rbpt = RandomBinaryProjectionTree('rbpt', 75, 75)
                tree = Engine(n_hidden, lshashes=[rbpt])

                for j, example in enumerate(tqdm(act_list[i])):
                    assert example.ndim == 1
                    assert example.shape[0] == n_hidden

                    tree.store_vector(example, j)
            else:  # if kdtree
                tree = KDTree(act_list[i])

            self.tree_list.append(tree)
Example #38
class TestEngine(unittest.TestCase):
    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertEqual(len(engine2.storage.buckets), 0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(),
                                   0,
                                   delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(),
                                   0,
                                   delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
Example #39
class DB:
    def __init__(self, feature_size=16, nearest_neighbours=1000):
        self.feature_size = feature_size
        self.nn = nearest_neighbours
        self.engine = None
        self.load_hashmap()

    def load_hashmap(self):
        # Create redis storage adapter
        # need to start redis service
        redis_object = Redis(host='localhost', port=6379, db=14)
        redis_storage = RedisStorage(redis_object)
        try:
            config = redis_storage.load_hash_configuration('test')
            lshash = RandomBinaryProjections(None, None)
            lshash.apply_config(config)

        except:
            # Config is not existing, create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 10)

        nearest = NearestFilter(self.nn)
        # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        self.engine = Engine(self.feature_size,
                             lshashes=[lshash],
                             vector_filters=[nearest],
                             storage=redis_storage,
                             distance=CosineDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)

    def query(self, fvector):
        query = np.asarray(fvector)

        # get nn nearest neighbours
        # a list of tuple (data, name, distance)
        N = self.engine.neighbours(query)
        return N

    def append_to_DB(self, fvector, name=""):
        if fvector is None:
            return
        self.engine.store_vector(np.asarray(fvector), name)
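A hedged sketch of using the DB class above; it assumes a local Redis server on port 6379 (as required by load_hashmap) and numpy as np, and the 16-dim feature vectors are illustrative.

# Hypothetical usage: requires a running Redis instance for the RedisStorage backend.
db = DB(feature_size=16, nearest_neighbours=10)
db.append_to_DB(np.random.rand(16), name='face_001')
for vec, name, distance in db.query(np.random.rand(16)):
    print(name, distance)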
Example #40
def create_hashing(sets):

    # Dimension of the vector space.
    dimension = len(sets[0][0])

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=[rbp])

    # Index all our values (set their data to a unique string).
    for index, s in enumerate(sets):
        for v in s:
            engine.store_vector(v, 'data_%d' % index)

    return engine
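A short sketch of create_hashing(); 'sets' is assumed to be a list of lists of equal-length numpy vectors, matching how the function derives the dimension and indexes the vectors.

# Hypothetical usage: two sets of 3-dimensional vectors, then a query near the first set.
sets = [[np.array([1.0, 0.0, 0.0]), np.array([0.9, 0.1, 0.0])],
        [np.array([0.0, 0.0, 1.0])]]
engine = create_hashing(sets)
print(engine.neighbours(np.array([1.0, 0.05, 0.0])))   # any hits carry 'data_<set-index>' labels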
Example #41
def test_sparse():
    dim = 500
    num_train = 1000
    num_test = 1
    train_data = ss.rand(dim, num_train)  # pickle.load('/home/jmahler/Downloads/feature_objects.p')
    test_data = ss.rand(dim, num_test)

    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dim, lshashes=[rbp])

    for i in range(num_train):
        engine.store_vector(train_data.getcol(i))

    for j in range(num_test):
        N = engine.neighbours(test_data.getcol(j))
        print N

    IPython.embed()
Example #42
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance  = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y, y_data, y_distance = n[0]
            normalized_x = unitvec(x)
            delta = 0.000000001
            self.assertAlmostEqual(numpy.abs((normalized_x - y)).max(), 0, delta=delta)
            self.assertEqual(y_data, x_data)
            self.assertAlmostEqual(y_distance, 0.0, delta=delta)
Example #43
class TestEngine(unittest.TestCase):

    def setUp(self):
        self.engine = Engine(1000)

    def test_storage_issue(self):
        engine1 = Engine(100)
        engine2 = Engine(100)

        for k in range(1000):
            x = numpy.random.randn(100)
            x_data = 'data'
            engine1.store_vector(x, x_data)

        # Each engine should have its own default storage
        self.assertTrue(len(engine2.storage.buckets)==0)

    def test_retrieval(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = numpy.random.randn(1000)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y == x).all())
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)

    def test_retrieval_sparse(self):
        for k in range(100):
            self.engine.clean_all_buckets()
            x = scipy.sparse.rand(1000, 1, density=0.05)
            x_data = 'data'
            self.engine.store_vector(x, x_data)
            n = self.engine.neighbours(x)
            y = n[0][0]
            y_data = n[0][1]
            y_distance = n[0][2]
            self.assertTrue((y - x).sum() == 0.0)
            self.assertEqual(y_data, x_data)
            self.assertEqual(y_distance, 0.0)
Example #44
class lshsearcher:
    def __init__(self):
        self.__dimension = None
        self.__engine_perm = None
        self.__permutations = None

    def _set_confval(self, dimension=None):
        if dimension is None:
            return None
        else:
            self.__dimension = dimension

    def _engine_on(self):
        # Create permutations meta-hash
        self.__permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp_perm = RandomBinaryProjections('rbp_perm', 14)
        rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

        # Add rbp as child hash of permutations hash
        self.__permutations.add_child_hash(rbp_perm, rbp_conf)

        # Create engine
        self.__engine_perm = Engine(self.__dimension, lshashes=[self.__permutations], distance=CosineDistance())

    def conf(self, dimension):
        self._set_confval(dimension)
        self._engine_on()

    def getData(self, v):
        if self.__engine_perm is not None:
            self.__engine_perm.store_vector(v)

    def commitData(self):
        if self.__permutations is not None:
            self.__permutations.build_permuted_index()

    def find(self, v):
        if self.__engine_perm is not None:
            return self.__engine_perm.neighbours(v)
Example #45
class testing_suite:
	"""
	Class to test SDF files in a nearest neighbor lookup format, under different models of representation 
	such as PCA, FactorAnalysis, KernelPCA with the rbf kernel, FastICA, and DictionaryLearning

	Sample Usage:

		test=testing_suite()
		test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver")
		num_train=12
		num_test=4

		test.make_train_test(num_train,num_test)
		accuracy,results=test.perform_PCA_tests()
	"""

	def __init__(self):
		self.PCA_changed_=True
		self.FA_changed_=True
		self.KPCA_changed_=True
		self.FICA_changed_=True
		self.DL_changed_=True
		self.all_files_=[]
		self.PCA_=None
		self.FA_ = None
		self.KPCA_ = None
		self.FICA_ = None
		self.DL_ = []
		self.testing_=[]
		self.training_=[]
		self.engine_=[]
		self.training_vectors_=None
		self.confusion_={}
		self.biggest=0

	def adddir(self,dir_to_add):
		"""
			add all sdf filepaths from a root directory tree (dir_to_add) to the all_files_
			instance variable
		"""
		sdf_files = []
		for root,dirs,files in walk(dir_to_add):
			for file_ in files:
				if file_.endswith("25.sdf"):
					sdf_files.append(path.join(root,file_))
		self.all_files_+=sdf_files

	def adddir_25(self,dir_to_add):
		"""add files in a directory only with dimension 12"""
		sdf_files = []
		for root,dirs,files in walk(dir_to_add):
			for file_ in files:
				if file_.endswith(".sdf"):
					tempsdf=SDF(path.join(root,file_))
					if tempsdf.dimensions()[0]==25*25*25:
						sdf_files.append(path.join(root,file_))
		self.all_files_+=sdf_files

	def addfile(self,file_to_add):
		"""add only one file to all_files"""
		self.all_files_.append(file_to_add)

	def make_train_test(self,num_train, num_test):
		"""
		populates the list of training files and testing files with filepaths based on a random
		number generator seeded with np.random.seed(100)

		Sample Usage:
			test=testing_suite()
			test.adddir("/mnt/terastation/shape_data/Cat50_ModelDatabase/screwdriver")
			num_train=12
			num_test=4

			test.make_train_test(num_train,num_test)
		"""
		assert num_train+num_test<=len(self.all_files_)
		np.random.seed(100)
		permuted_indices = np.random.permutation(len(self.all_files_))
		get_training = itemgetter(*permuted_indices[:num_train])
		get_testing = itemgetter(*permuted_indices[num_train:num_train+num_test])
		if num_train > 1:
			self.training_ = get_training(self.all_files_)
		else:
			self.training_= [get_training(self.all_files_)]


		if num_test > 1:
			self.testing_ = get_testing(self.all_files_)
		else:
			self.testing_ = [get_testing(self.all_files_)]

	def normalize_vector(self,vector,largest_dimension):
		"""normalizes smaller sdf vectors to a larger size by vertical stacking a column of zeros underneath"""
		return np.vstack((vector,np.zeros((largest_dimension-vector.shape[0],1))))

	def get_PCA_training_vectors(self):
		"""
		gets all training_vectors from the set of training files, normalizes them using normalize 
		vector and adds them all to a numpy array that gets returned
		"""
		training_sdf=[SDF(i) for i in list(self.training_)]
		
		self.biggest=0
		for item in training_sdf:
			self.biggest=max(self.biggest,item.dimensions()[0])
		return_train_vectors=None
		for tempsdf in training_sdf:
			vectorized=np.reshape(tempsdf.data(),(tempsdf.dimensions()[0],1))
			normal_vector=self.normalize_vector(vectorized,self.biggest)
			if return_train_vectors==None:
				return_train_vectors=normal_vector
			else:
				return_train_vectors=np.concatenate((return_train_vectors,normal_vector),axis=1)
		return return_train_vectors

	"""
	-any function begining with make creates the sklearn.decomposition framework for the specified 
	decomposition type 
	-any function begining with fit fits the training vectors to the decomposition framework
	-any function begining with transform transforms the training vectors based on the fitted 
	decomposition framework
	"""

	def render_sdf(self, a, thresh=1e-3):
		h = plt.figure()
		ax = h.add_subplot(111, projection='3d')

		surface_points = np.where(np.abs(a) < thresh)

		x = surface_points[0]
		y = surface_points[1]
		z = surface_points[2]
		ax.scatter(x, y, z)

		ax.set_xlabel('X')
		ax.set_ylabel('Y')
		ax.set_zlabel('Z')
		ax.set_xlim3d(0, a.shape[0])
		ax.set_ylim3d(0, a.shape[1])
		ax.set_zlim3d(0, a.shape[2])

		plt.show()


	def make_PCA(self):
		self.PCA_=skdec.PCA()#n_components='mle')

	def fit_PCA(self,training_vectors):
		self.PCA_.fit(training_vectors)                
                
	def make_FA(self):
		self.FA_=skdec.FactorAnalysis(n_components=len(list(self.training_)))

	def fit_FA(self,training_vectors):
		self.FA_.fit(training_vectors)

	def make_KPCA(self,kernel_option="rbf"):
		self.KPCA_=skdec.KernelPCA(gamma=0.1, kernel=kernel_option)

	def fit_KPCA(self,training_vectors):
		self.KPCA_.fit(training_vectors)

	def make_FICA(self):
		self.FICA_=skdec.FastICA(n_components=len(list(self.training_)))

	def fit_FICA(self,training_vectors):
		self.FICA_.fit(training_vectors)

	def make_DL(self,alpha_values):
		self.DL_.append(skdec.DictionaryLearning(n_components=len(list(self.training_)),alpha= alpha_values,transform_algorithm = 'omp'))

	def fit_DL(self,training_vectors):
		self.DL_[-1].fit(training_vectors)

	def load_PCA(self,vector_set):
		"""reinitializes our engine and loads a numpy set of vectors of dimension (self.biggest,1) 
		into self.engine_"""
		rbp = RandomBinaryProjections('rbp', 10)
		self.engine_ = Engine(self.PCA_.components_.shape[1], lshashes=[rbp])
		transformed_vectors = self.PCA_.transform(vector_set.T)
		for i in range(len(list(self.training_))):
			#vector=vector_set[:,i]                        
			#vector=np.reshape(vector,(self.biggest,1))
			#vector=self.PCA_.transform(vector)
			self.engine_.store_vector(transformed_vectors[i,:], self.training_[i])

	def load_FA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.FA_.transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])

	def load_KPCA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.KPCA_.alphas_.shape[1], lshashes=[rbp])
		transformed_vectors = self.KPCA_.transform(vector_set.T)
		for i in range(len(list(self.training_))):
			#vector=vector_set[:,i]
			#vector=np.reshape(vector,(self.biggest,1))
			#vector=self.KPCA_.transform(vector)
			self.engine_.store_vector(transformed_vectors[i,:], self.training_[i])

	def load_FICA(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.FICA_.transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])

	def load_DL(self,vector_set):
		rbp = RandomBinaryProjections('rbp',10)
		self.engine_ = Engine(self.biggest, lshashes=[rbp])
		for i in range(len(list(self.training_))):
			vector=vector_set[:,i]
			vector=np.reshape(vector,(self.biggest,1))
			vector=self.DL_[-1].transform(vector)
			self.engine_.store_vector(vector[:,0],self.training_[i])		

	def engine_query(self,test_vector):
		"""
		queries the engine with a (self.biggest,1) dimension vector and returns the file_names of nearest
		neighbors and the results
		"""
		#print test_vector
		#reshaped=np.reshape(test_vector,(self.biggest,1))
		results = self.engine_.neighbours(test_vector.T)
		file_names = [i[1] for i in results]
		return file_names, results

	def setup_confusion(self):
		"""
		reinitializes the self.confusion_ confusion matrix variable
		"""
		self.confusion_={}
		self.confusion_[UNKNOWN_TAG] = {}
		for file_ in self.all_files_:
			category = cat50_file_category(file_)
			self.confusion_[category] = {}
		for query_cat in self.confusion_.keys():
			for pred_cat in self.confusion_.keys():
				self.confusion_[query_cat][pred_cat] = 0

	"""
	Makes a test vector by taking in an SDF, reshaping it, normalizing it, then returns a transformed
	version of that vector based on the corresponding decomposition model that was already trained
	"""

	def make_test_vector(self,sdf_array,vector_type):
		if vector_type=="PCA":
 			return self.make_PCA_test_vector(sdf_array)
		elif vector_type=="FA":
			return self.make_FA_test_vector(sdf_array)
		elif vector_type=="KPCA":
			return self.make_KPCA_test_vector(sdf_array)
		elif vector_type=="FICA":
			return self.make_FICA_test_vector(sdf_array)
		elif vector_type=="DL":
			return self.make_DL_test_vector(sdf_array)

	def make_DL_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		normalized=self.normalize_vector(reshaped,self.biggest)
		return self.DL_[-1].transform(normalized)[:,0]

	def make_FICA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		normalized=self.normalize_vector(reshaped,self.biggest)
		return self.FICA_.transform(normalized)[:,0]

	def make_KPCA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
                return self.KPCA_.transform(reshaped.T)
#		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
#		normalized=self.normalize_vector(reshaped,self.biggest)
#		return self.KPCA_.transform(normalized)[:,0]

	def make_FA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
		normalized=self.normalize_vector(reshaped,self.biggest)
		return self.FA_.transform(normalized)[:,0]

	def make_PCA_test_vector(self,sdf_array):
		reshaped=np.reshape(sdf_array.data(),(sdf_array.dimensions()[0],1))
                return self.PCA_.transform(reshaped.T)
#               IPython.embed()
		
#		normalized=self.normalize_vector(reshaped,self.biggest)
		
#		return self.PCA_.transform(normalized)[:,0]

	"""
	querys the loaded and trained engine with each of your test vectors from make_train_test
		Returns
	        accuracy: float representing the accuracy of querying the nearpy engine with the test results
	        test_results: dictionary of the results from the "testing" for each of the sdf_files 
	"""
	def perform_tests(self,K,test_type):
		test_results={}
		for file_ in list(self.testing_):
			query_category=cat50_file_category(file_)
			print "Querying: %s with category %s "%(file_, query_category)
			converted = SDF(file_)
			test_vector=self.make_test_vector(converted,test_type)
			closest_names, closest_vals=self.engine_query(test_vector.T[:,0])

			pred_category=UNKNOWN_TAG

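			# keep UNKNOWN unless neighbours were found; start from the nearest neighbour's category and, if the true category appears anywhere in the top K, credit it as the prediction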
			if len(closest_names)>0:
				closest_category=closest_names[0]
				pred_category=cat50_file_category(closest_category)

				for i in range(1,min(K,len(closest_names))):
					closest_category = closest_names[i]
					potential_category = cat50_file_category(closest_category)

					if potential_category == query_category:
						pred_category = potential_category
			print "Result Category: %s"%(pred_category)

			self.confusion_[query_category][pred_category] += 1
			test_results[file_]= [(closest_names, closest_vals)]

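		# flatten the nested dict into a matrix: row = true (query) category, column = predicted category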
		row_names=self.confusion_.keys()
		confusion_mat=np.zeros([len(row_names),len(row_names)])
		i=0
		for query_cat in self.confusion_.keys():
			j = 0
			for pred_cat in self.confusion_.keys():
				confusion_mat[i,j] = self.confusion_[query_cat][pred_cat]
				j += 1
			i += 1

	    # get true positives, etc for each category
		num_preds = len(self.testing_)
		tp = np.diag(confusion_mat)
		fp = np.sum(confusion_mat, axis=0) - np.diag(confusion_mat)
		fn = np.sum(confusion_mat, axis=1) - np.diag(confusion_mat)
		tn = num_preds * np.ones(tp.shape) - tp - fp - fn

	    # compute useful statistics
		recall = tp / (tp + fn)
		tnr = tn / (fp + tn)
		precision = tp / (tp + fp)
		npv = tn / (tn + fn)
		fpr = fp / (fp + tn)
		accuracy = np.sum(tp) / num_preds # correct predictions over entire dataset

	    # remove nans
		recall[np.isnan(recall)] = 0
		tnr[np.isnan(tnr)] = 0
		precision[np.isnan(precision)] = 0
		npv[np.isnan(npv)] = 0
		fpr[np.isnan(fpr)] = 0

		return accuracy, test_results, recall, tnr, precision,npv,fpr


        def vis_pca_components(self, num_comp_vis, thresh = 0.01, method = 'PCA'):
                PCA = self.PCA_
                if method == 'KPCA':
                        PCA = self.KPCA_
                num_components = PCA.components_.shape[0]
                num_components = min(num_comp_vis, num_components)

                comp_per_dim = int(math.ceil(math.sqrt(num_components)))
                h = plt.figure()
                for i in range(num_components):
                        ax = h.add_subplot(comp_per_dim, comp_per_dim, i+1, projection = '3d')
                        components = PCA.components_[i,:]
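                        # assumes each component maps back onto a 25x25x25 SDF grid (hard-coded below)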
                        comp_grid = components.reshape(25, 25, 25)
                        
                        surface_points = np.where(np.abs(comp_grid) < thresh)
                        x = surface_points[0]
                        y = surface_points[1]
                        z = surface_points[2]

                        ax.scatter(x, y, z)
                        ax.set_xlabel('X')
                        ax.set_ylabel('Y')
                        ax.set_zlabel('Z')
                        ax.set_xlim3d(0,25)
                        ax.set_ylim3d(0,25)
                        ax.set_zlim3d(0,25)
                        ax.set_title('Component %d'%(i))
                plt.show()

        def vis_pca_component_slices(self, num_comp_vis, method = 'PCA'):
                PCA = self.PCA_
                if method == 'KPCA':
                        PCA = self.KPCA_
                num_components = PCA.components_.shape[0]
                num_components = min(num_comp_vis, num_components)

                comp_per_dim = int(math.ceil(math.sqrt(num_components)))
                plt.figure()
                for i in range(num_components):
                        plt.subplot(comp_per_dim, comp_per_dim, i+1)
                        components = PCA.components_[i,:]
                        comp_grid = components.reshape(25, 25, 25)
                        comp_slice = comp_grid[:,:,12]

                        plt.imshow(comp_slice)
                        plt.title('Component %d XY Plane'%(i))

                plt.figure()
                for i in range(num_components):
                        plt.subplot(comp_per_dim, comp_per_dim, i+1)
                        components = PCA.components_[i,:]
                        comp_grid = components.reshape(25, 25, 25)
                        comp_slice = comp_grid[:,12,:]

                        plt.imshow(comp_slice)
                        plt.title('Component %d XZ Plane'%(i))

                plt.figure()
                for i in range(num_components):
                        plt.subplot(comp_per_dim, comp_per_dim, i+1)
                        components = PCA.components_[i,:]
                        comp_grid = components.reshape(25, 25, 25)
                        comp_slice = comp_grid[12,:,:]

                        plt.imshow(comp_slice)
                        plt.title('Component %d YZ Plane'%(i))
                plt.show()


	"""
	runs perform_tests on a specific type of decomposition after creating that decomposition type 
	framework with the training vectors and loading those training vectors into the engine

	K is the number of neighbors to check
	"""
	def perform_PCA_tests(self,K):
		train_vectors=self.get_PCA_training_vectors()
		self.make_PCA()
                print 'Fitting PCA'
		self.fit_PCA(train_vectors.T)
                print 'Loading PCA'
		self.load_PCA(train_vectors)
                print 'Setup confusion'
		self.setup_confusion()
                print 'Eval accuracy'
                #IPython.embed()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"PCA")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def perform_FA_tests(self,K):
		train_vectors=self.get_PCA_training_vectors()
		self.make_FA()
		self.fit_FA(train_vectors)
		self.load_FA(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"FA")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def perform_KPCA_tests(self,K,kernel="rbf"):
		train_vectors=self.get_PCA_training_vectors()
		self.make_KPCA(kernel_option=kernel)
                print 'Fitting KPCA'
		self.fit_KPCA(train_vectors.T)
                print 'Loading KPCA'
		self.load_KPCA(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"KPCA")
                #IPython.embed()
		return accuracy,test_results, recall, tnr, precision,npv,fpr


	def perform_FICA_tests(self,K):
		train_vectors=self.get_PCA_training_vectors()
		self.make_FICA()
		self.fit_FICA(train_vectors)
		self.load_FICA(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"FICA")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def perform_DL_tests(self,K,alpha):
		train_vectors=self.get_PCA_training_vectors()
		self.make_DL(alpha_values=alpha)
		self.fit_DL(train_vectors)
		self.load_DL(train_vectors)
		self.setup_confusion()
		accuracy,test_results, recall, tnr, precision,npv,fpr=self.perform_tests(K,"DL")
		return accuracy,test_results, recall, tnr, precision,npv,fpr

	def get_engine(self):
		return self.engine_

	def get_PCA(self):
		return self.PCA_

	def get_FA(self):
		return self.FA_

	def get_KPCA(self):
		return self.KPCA_

	def get_FICA(self):
		return self.FICA_

	def get_DL(self):
		return self.DL_

	def get_explained_variance_ratio(self):
		return self.PCA_.explained_variance_ratio_
Ejemplo n.º 46
0
import csv
import numpy

from redis import Redis

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage

# Create a random binary hash with 10 bits
rbp = RandomBinaryProjections('rbp', 10)

# Create engine with pipeline configuration
redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))
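# 'dimension' must already be defined and equal to the length of the feature rows read below (it is not set in this excerpt)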
engine = Engine(dimension, lshashes=[rbp], storage=redis_storage)

index = 0
with open("Adele.csv", "rb") as csvfile:
    featurereader = csv.reader(csvfile, delimiter=',')
    for row in featurereader:
        index = index+1
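        # each row is parsed as fixed-width 4-byte strings before the float conversion, so values longer than 4 characters are truncated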
        x = numpy.array(row, dtype='|S4')
        y = x.astype(numpy.float)
        engine.store_vector(y, 'Adele - Hello_%d' % index)

index=0        
with open("BlurredLines.csv", "rb") as csvfile:
    featurereader = csv.reader(csvfile, delimiter=',')
    for row in featurereader:
        index = index+1
        x = numpy.array(row, dtype='|S4')
        y = x.astype(numpy.float)
        engine.store_vector(y, 'Robin Thicke - Blurred Lines ft. T.I. Pharrell_%d' % index)

index=0        
with open("CallMeMaybe.csv", "rb") as csvfile:
    featurereader = csv.reader(csvfile, delimiter=',')
    for row in featurereader:
        index = index+1
Ejemplo n.º 47
0
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension, lshashes=[permutations2], distance=CosineDistance(), vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1 

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)
 
    def query(self, person_list):
        dists = []
        scores = []
        for person in person_list:
            query = map(float, self.face_feature[person].split(','))
            print '\nNeighbour distances with multiple binary hashes:'
            print '  -> Candidate count is %d' % self.engine.candidate_count(query)
            results = self.engine.neighbours(query)
            dists = dists + [x[1] for x in results]
            scores = scores + [x[2] for x in results]
        t_num = [self.ground_truth['_'.join(x.split('_')[:-1])] for x in dists]
        res = zip(dists, scores, t_num)
        res.sort(key = lambda t: t[1])
        res1 = self.f7(res, person_list)
        return res1[:self.neighbour]

    def true_num(self, person):
        return self.ground_truth[person]

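    # order-preserving de-duplication by name that also drops the query persons themselves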
    def f7(self, zip_seq, person_list):
        seen = set()
        seen_add = seen.add
        return [ x for x in zip_seq if not (x[0] in seen or seen_add(x[0]) or x[0] in person_list)]
Ejemplo n.º 48
0
import h5py
import pickle

from nearpy.distances import EuclideanDistance
from nearpy.filters import NearestFilter
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import MemoryStorage
from nearpy import Engine

#from redis import Redis
#from nearpy.storage import RedisStorage
from nearpy.storage import GonzaloStorage

#load the visual features of all the images from the dataset
featIN=h5py.File('featIN.mat')['featIN']
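# featIN is assumed to be indexed as featIN[feature, image], i.e. a 4096 x N array of visual features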

#Create binary projections and save them in HD
rbp = RandomBinaryProjections('rbp', 10)
dimension=4096

#Use the custom GonzaloStorage backend (the Redis storage variant is commented out above)
gonzalo_storage = GonzaloStorage()
engine = Engine(dimension, lshashes=[rbp], distance=EuclideanDistance(),vector_filters=[NearestFilter(20)],  storage=gonzalo_storage)

fp = open('engine.txt', 'w')
pickle.dump(engine, fp)
fp.close()

#engine = Engine(dimension, lshashes=[rbp])
for index in range(1000000):
    v=featIN[range(dimension),index]
    #v=numpy.float16(featIN[range(dimension),index])
    engine.store_vector(v, 'data_%d' % index)

engine.save_all()
Ejemplo n.º 49
0
def example1():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't index too many because of the exact search at the end)
    POINTS = 10000

    print 'Creating engines'

    # We want 20 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('rbpt', 20, 20)

    # Create engine 1
    engine_rbpt = Engine(DIM, lshashes=[rbpt], distance=CosineDistance())

    # Create binary hash
    rbp = RandomBinaryProjections('rbp1', 20)

    # Create engine 2
    engine = Engine(DIM, lshashes=[rbp], distance=CosineDistance())

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 20)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine 3
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 12)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine 4
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    print 'Indexing %d random vectors of dimension %d' % (POINTS, DIM)

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine.store_vector(v)
        engine_rbpt.store_vector(v)
        engine_perm.store_vector(v)
        engine_perm2.store_vector(v)

    print 'Buckets 1 = %d' % len(engine.storage.buckets['rbp1'].keys())
    print 'Buckets 2 = %d' % len(engine_rbpt.storage.buckets['rbpt'].keys())

    print 'Building permuted index for HashPermutations'

    # Then update permuted index
    permutations.build_permuted_index()

    print 'Generate random data'

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 1
    print '\nNeighbour distances with RandomBinaryProjectionTree:'
    print '  -> Candidate count is %d' % engine_rbpt.candidate_count(query)
    results = engine_rbpt.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 2
    print '\nNeighbour distances with RandomBinaryProjections:'
    print '  -> Candidate count is %d' % engine.candidate_count(query)
    results = engine.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutations2:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
Ejemplo n.º 50
0
class RMAX_repr(Representation):
    """
    Identical to the Tabular representation (i.e. it assigns a binary feature function
    f_{d}() to each possible discrete state *d* in the domain, with
    f_{d}(s) = 1 when d=s, 0 elsewhere).
    HOWEVER, unlike *Tabular*, feature functions are only created for *s* which
    have been encountered in the domain, not instantiated for every single 
    state at the outset.

    """
    def __init__(self, domain, Rmax, LQ, k = 1, epsilon_d = 0.01):
        # LQ is the Lipschitz constant (10**3 in the paper, chosen by cross-validation)
        self.LQ = LQ
        self.gamma = domain.discount_factor

        self.rmax = Rmax
        self.qmax = Rmax / (1-self.gamma)
        self.qmax_tilda = Rmax + self.gamma * self.qmax
        self.epsilon = epsilon_d

        # Approximate k-NN is used when finding the Q value of a point
        self.k = k

        # We also keep track of the states sampled so far
        self.sample_list = [0]*(2*100000)
        self.list_idx = 0
        # And a dictionary for quick lookups of already computed values
        self.sample_values = {}

        # And we use an LSH to find the approximate k-Nearest neighbours
        # by training it on every s, a, r, s' tuple we see
        self.init_randomization()
        
        super(
            RMAX_repr,
            self).__init__(
            domain)

    
    def init_randomization(self):
        rbp = RandomBinaryProjections('rbp', 10)
        from nearpy.distances import ChebyshevDistance
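        # index (s, a) points (7 dims here) and retrieve up to k nearest under the max-norm (Chebyshev) distance, matching d() below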
        self.engine = Engine(7, lshashes = [rbp], vector_filters=[NearestFilter(self.k)], distance=ChebyshevDistance())

    def is_known(self, s, a):
        # A s, a pair is 'known' if LQ * d(s, a, s', a') < epsilon_d
        indices = self.approx_nn(s, a)
        if not indices:
            return False

        for idx in indices:
            s_p, a_p = self.sample_list[idx]
            if self.LQ * self.d(s, a, s_p, a_p) > self.epsilon:
                return False
        return True

    def pre_discover(self, s, p_terminal, a, r, ns, terminal):
        # In the learning stage, if sa is not 'known' add it to the sample list
        # and its value to sample value.
        if not self.is_known(s, a):
            x = r + self.gamma * max(self.Q_tilda(ns, a_p) for a_p in range(self.actions_num))

            self.engine.store_vector(np.append(s, a), self.list_idx)            
            self.sample_list[self.list_idx]= (s, a)
            self.list_idx+=1
            self.sample_values[self.sa_tuple(s, a)] = x

            #self.LSH.partial_fit(np.append(s, a))
        super(RMAX_repr, self).pre_discover(s, p_terminal, a, ns, terminal)

    # Compute a distance metric between (s, a) and (ns, na).
    # Using max-norm as in the paper for now.
    def d(self, s, a, ns, na):
        # Create one big s,a array
        sa = np.append(s, a)
        nsa = np.append(ns, na)
        # Use scipy to compute the chebyshev distance => Max norm
        return distance(sa, nsa)

    def approx_nn(self, s, a):
        #dist, indices = self.LSH.kneighbors(np.append(s, a))
        # engine.neighbours returns a list of (vector, data, distance) tuples; the stored data is the sample_list index
        l = self.engine.neighbours(np.append(s, a))
        indices = [elem[1] for elem in l]
        return indices

    def sa_tuple(self, s, a):
        return tuple(np.append(s, a))
    
    # The approximate Q function 
    def Q_tilda(self, s, a):
        k = self.k
        q = 0.0
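        # each nearby sample contributes an upper bound x_j + L_Q * d; samples that are too far (or missing) are replaced by the optimistic qmax_tilda, and the k contributions are averaged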
        # First get the k-nearest sampled neighbours to this point using LSH
        indices = self.approx_nn(s, a)
        num_neighbors = 0

        for index in indices:
            sj, aj = self.sample_list[index]
            dij = self.d(s, a, sj, aj)
            if dij <= (self.qmax / self.LQ):
                xj = self.sample_values[self.sa_tuple(sj, aj)]
                q += dij * self.LQ + xj
                num_neighbors += 1

        # In case there were fewer than k neighbors - use qmax_tilda for the remaining
        for i in range(num_neighbors, k):
            q += self.qmax_tilda
        # Return the average Q
        return q/k
        

    def Qs(self, s, terminal, phi_s=None):
        # Q -> Array of Q(s, a) values for this state
        # A -> Corresponding IDs

        # Before any learning is done, the experiment calls the policy to
        # estimate prior performance. In that case, the LSHF would throw a 
        # Value Error. We pre-empt that here
        Q = np.zeros((self.actions_num))
        #try :
        #    self.LSH.kneighbors(np.append(s, 0))
        #except ValueError:
        #    return Q
    
        for a in range(self.actions_num):
            Q[a] = self.Q_tilda(s, a)
        return Q
Ejemplo n.º 51
0
import json
import numpy as np
import cPickle as pickle

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage
from redis import Redis

dimension = 100

lshash = RandomBinaryProjections('DocHash', 12, rand_seed=123)

redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=1))
engine = Engine(dimension, lshashes=[lshash], storage=redis_storage)

with open("ids.json") as f:
    ids = json.load(f)

docvecs = np.load("hndbow.docvecs.doctag_syn0.npy", mmap_mode='r')

for i,id in enumerate(ids):
    vec = docvecs[i] # 1x100 nparray
    engine.store_vector(vec, id)

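# persist the hash configuration so a separate query process can rebuild identical projections from Redis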
redis_storage.store_hash_configuration(lshash)
Ejemplo n.º 52
0
def example2():

    # Dimension of feature space
    DIM = 100

    # Number of data points (don't index too many because of the exact search at the end)
    POINTS = 20000

    ##########################################################

    print 'Performing indexing with HashPermutations...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp_perm = RandomBinaryProjections('rbp_perm', 14)
    rbp_conf = {'num_permutation':50,'beam_size':10,'num_neighbour':100}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(DIM, lshashes=[permutations], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm.store_vector(v)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 3
    print '\nNeighbour distances with HashPermutations:'
    print '  -> Candidate count is %d' % engine_perm.candidate_count(query)
    results = engine_perm.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with HashPermutationMapper...'
    t0 = time.time()

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    # Create binary hash as child hash
    rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

    # Add rbp as child hash of permutations hash
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    engine_perm2 = Engine(DIM, lshashes=[permutations2], distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_perm2.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print '\nNeighbour distances with HashPermutationMapper:'
    print '  -> Candidate count is %d' % engine_perm2.candidate_count(query)
    results = engine_perm2.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]

    ##########################################################

    print '\nPerforming indexing with multiple binary hashes...'
    t0 = time.time()

    hashes = []
    for k in range(20):
        hashes.append(RandomBinaryProjections('rbp_%d' % k, 10))

    # Create engine
    engine_rbps = Engine(DIM, lshashes=hashes, distance=CosineDistance())

    # First index some random vectors
    matrix = numpy.zeros((POINTS,DIM))
    for i in xrange(POINTS):
        v = numpy.random.randn(DIM)
        matrix[i] = v
        engine_rbps.store_vector(v)

    t1 = time.time()
    print 'Indexing took %f seconds' % (t1-t0)

    # Get random query vector
    query = numpy.random.randn(DIM)

    # Do random query on engine 4
    print '\nNeighbour distances with multiple binary hashes:'
    print '  -> Candidate count is %d' % engine_rbps.candidate_count(query)
    results = engine_rbps.neighbours(query)
    dists = [x[2] for x in results]
    print dists

    # Real neighbours
    print '\nReal neighbour distances:'
    query = query.reshape((1,DIM))
    dists = CosineDistance().distance_matrix(matrix,query)
    dists = dists.reshape((-1,))
    dists = sorted(dists)
    print dists[:10]
Ejemplo n.º 53
0
class LSHSearch:
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        self.permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = map(float, tmp_feature[item].split(','))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print 'PCA matrix : ', len(matrix)

        rbp_perm2 = PCABinaryProjections(
            'testPCABPHash', lsh_project_num, matrix)
        self.permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(
            self.dimension,
            lshashes=[self.permutations2],
            distance=CosineDistance(),
            vector_filters=[nearest])

    def build(self):
        with open(self.feature_file, 'rb') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                self.face_feature[name] = feature
                person = '_'.join(name.split('_')[:-1])
                self.ground_truth[person] += 1

        for item in self.face_feature.keys():
            v = map(float, self.face_feature[item].split(','))
            self.engine.store_vector(v, item)

    def update(self, person, feature):
        print feature
        v = map(float, feature.split(','))
        epoch_time = long(time.time())
        f_name = person + '_' + str(epoch_time)
        print f_name
        self.engine.store_vector(v, f_name)

    def query(self, person_feature):
        dists = []
        scores = []

        query = map(float, person_feature.split(','))
        # print '\nNeighbour distances with mutliple binary hashes:'
        # print '  -> Candidate count is %d' % self.engine.candidate_count(query)
        results = self.engine.neighbours(query)
        dists = dists + [x[1] for x in results]
        scores = scores + [x[2] for x in results]

        res = zip(dists, scores)
        res.sort(key=lambda t: t[1])
        return res[:self.neighbour]
Ejemplo n.º 54
0
class TestRandomBinaryProjectionTree(unittest.TestCase):

    def setUp(self):
        self.memory = MemoryStorage()
        self.redis_object = Redis(host='localhost',
                                  port=6379, db=0)
        self.redis_storage = RedisStorage(self.redis_object)

    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)

    def test_storage_memory(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.memory.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.memory.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])

    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100, lshashes=[rbpt], vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)


        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for k in range(len(keys1)):
                self.assertEqual(keys1[k], keys2[k])
Ejemplo n.º 55
0
import json
import numpy as np
import cPickle as pickle

from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage
from redis import Redis

from ne import CosineSim

dimension = 100

with open("hndbow.index2word", 'r') as f:
    index2words = json.load(f)

wordvecs = np.load("hndbow.syn0.npy")

redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=3))

lshash = RandomBinaryProjections('WordHash', 5, rand_seed=123)

engine = Engine(dimension, distance=CosineSim(), lshashes=[lshash], storage=redis_storage)

for i,w in enumerate(index2words):
    vec = wordvecs[i] # 1x100 nparray
    engine.store_vector(vec, w)

redis_storage.store_hash_configuration(lshash)