Example #1
0
    def test_hash_memory_storage_pcabp(self):
        train_vectors = numpy.random.randn(10, 100)
        hash1 = PCABinaryProjections('testPCABPHash', 4, train_vectors)

        self.memory.store_hash_configuration(hash1)

        hash2 = PCABinaryProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testPCABPHash'))

        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)

        for i in range(hash1.components.shape[0]):
            for j in range(hash1.components.shape[1]):
                self.assertEqual(hash1.components[i, j], hash2.components[i, j])
Example #3
0
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'r') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = list(map(float, tmp_feature[item].split(',')))
            matrix.append(np.array(v))
            label.append(item)
        random.shuffle(matrix)
        print('PCA matrix:', len(matrix))

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num,
                                         matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension,
                             lshashes=[permutations2],
                             distance=CosineDistance(),
                             vector_filters=[nearest])
Example #4
0
class TestPCABinaryProjections(unittest.TestCase):

    def setUp(self):
        self.vectors = numpy.random.randn(10, 100)
        self.pbp = PCABinaryProjections('pbp', 4, self.vectors)

    def test_hash_format(self):
        h = self.pbp.hash_vector(numpy.random.randn(10))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 4)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic(self):
        x = numpy.random.randn(10)
        first_hash = self.pbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.pbp.hash_vector(x)[0])
Example #5
0
 def fit(self, X, y=None, hash="randbinary"):
     X = np.array(X)
     assert len(X.shape) == 2, "X not 2-rank"
     dimension = X.shape[-1]
     if hash == "randbinary":
         rbp = RandomBinaryProjections('rbp', 10)
     elif hash == "pcabinary":
         rbp = PCABinaryProjections('rbp', 10, training_set=X)
     self.engine = Engine(dimension, lshashes=[rbp])
     index = 0
     for x in X:
         self.engine.store_vector(x, str(index))
         index += 1
Example #6
0
 def get_engine(self, vocab, vecs):
     logging.info('{} hash functions'.format(self.args.projections))
     hashes = [
         PCABinaryProjections('ne1v', self.args.projections,
                              vecs[:1000, :].T)
     ]
     engine = Engine(vecs.shape[1],
                     lshashes=hashes,
                     distance=[],
                     vector_filters=[])
     for ind, vec in enumerate(vecs):
         if not ind % 100000:
             logging.info('{} words added to nearpy engine'.format(ind))
         engine.store_vector(vec, ind)
     return engine
Example #7
0
 def get_nearpy_engine(self):
     """
     Populates the nearpy engine. Note that instantiating the PCA hash
     runs a PCA over 1000 target vectors and may consume a lot of memory.
     """
     logging.info("Creating nearpy engine...")
     hashes = [
         PCABinaryProjections("ne1v", self.args.n_proj,
                              self.tg_model.syn0[:1000, :].T)
     ]
     logging.info(hashes)
     dim = self.tg_model.layer1_size
     self.engine = Engine(dim,
                          lshashes=hashes,
                          vector_filters=[],
                          distance=[])
     for ind in range(self.tg_model.syn0.shape[0]):
         if not ind % 200000:
             logging.debug(
                 "{} target words added to nearpy engine".format(ind))
         self.engine.store_vector(self.tg_model.syn0[ind, :], ind)
Example #8
0
 def setUp(self):
     self.vectors = numpy.random.randn(10, 100)
     self.pbp = PCABinaryProjections('pbp', 4, self.vectors)
Example #9
0
 def setUp(self):
     self.vectors = numpy.random.randn(10, 100)
     self.pbp = PCABinaryProjections('pbp', 4, self.vectors)
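
The examples above share one pattern: fit a PCABinaryProjections hash on a training matrix (one vector per column), plug it into an Engine, and index vectors with store_vector. The sketch below pulls those pieces together on made-up random data; the variable names are illustrative and it assumes nearpy's standard Engine.neighbours query method for the lookup step.

    import numpy
    from nearpy import Engine
    from nearpy.distances import CosineDistance
    from nearpy.filters import NearestFilter
    from nearpy.hashes import PCABinaryProjections

    # Training matrix: one 20-dimensional vector per column.
    dim = 20
    train_vectors = numpy.random.randn(dim, 500)

    # Fit a 4-bit PCA binary projection hash on the training vectors.
    pcabp = PCABinaryProjections('pcabp', 4, train_vectors)

    # Engine that buckets vectors by the PCA hash and keeps the 5 nearest matches.
    engine = Engine(dim,
                    lshashes=[pcabp],
                    distance=CosineDistance(),
                    vector_filters=[NearestFilter(5)])

    # Index the training vectors, then query with a fresh random vector.
    for index in range(train_vectors.shape[1]):
        engine.store_vector(train_vectors[:, index], 'vector_%d' % index)

    for vector, data, distance in engine.neighbours(numpy.random.randn(dim)):
        print(data, distance)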