def test_hash_memory_storage_pcabp(self):
    train_vectors = numpy.random.randn(10, 100)
    hash1 = PCABinaryProjections('testPCABPHash', 4, train_vectors)
    self.memory.store_hash_configuration(hash1)

    # Rebuild an empty hash from the stored configuration
    hash2 = PCABinaryProjections(None, None, None)
    hash2.apply_config(self.memory.load_hash_configuration('testPCABPHash'))

    self.assertEqual(hash1.dim, hash2.dim)
    self.assertEqual(hash1.hash_name, hash2.hash_name)
    self.assertEqual(hash1.projection_count, hash2.projection_count)

    # The PCA components must survive the round trip element-wise
    for i in range(hash1.components.shape[0]):
        for j in range(hash1.components.shape[1]):
            self.assertEqual(hash1.components[i, j], hash2.components[i, j])
def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
    self.feature_file = feature_file
    self.dimension = dimension
    self.neighbour = neighbour
    self.face_feature = defaultdict(str)
    self.ground_truth = defaultdict(int)

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    tmp_feature = defaultdict(str)
    with open(feature_file, 'r') as f:
        reader = csv.reader(f, delimiter=' ')
        for name, feature in reader:
            tmp_feature[name] = feature

    matrix = []
    label = []
    for item in tmp_feature.keys():
        v = list(map(float, tmp_feature[item].split(',')))
        matrix.append(np.array(v))
        label.append(item)
    random.shuffle(matrix)
    print('PCA matrix:', len(matrix))

    rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    nearest = NearestFilter(self.neighbour)
    self.engine = Engine(self.dimension, lshashes=[permutations2],
                         distance=CosineDistance(),
                         vector_filters=[nearest])
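A hedged usage sketch for the engine constructed above; `query_vec` is an assumed input, not part of the original, and the unpacking relies on NearPy's Engine.neighbours returning (vector, data, distance) tuples once a distance and NearestFilter are configured as in __init__.

# Hypothetical lookup against self.engine (query_vec is an assumed
# feature vector of dimension self.dimension):
for vec, name, dist in self.engine.neighbours(query_vec):
    print(name, dist)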
class TestPCABinaryProjections(unittest.TestCase):

    def setUp(self):
        self.vectors = numpy.random.randn(10, 100)
        self.pbp = PCABinaryProjections('pbp', 4, self.vectors)

    def test_hash_format(self):
        h = self.pbp.hash_vector(numpy.random.randn(10))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 4)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic(self):
        x = numpy.random.randn(10)
        first_hash = self.pbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.pbp.hash_vector(x)[0])
def fit(self, X, y=None, hash="randbinary"):
    X = np.array(X)
    assert len(X.shape) == 2, "X not 2-rank"
    dimension = X.shape[-1]
    if hash == "randbinary":
        rbp = RandomBinaryProjections('rbp', 10)
    elif hash == "pcabinary":
        # PCABinaryProjections expects training samples as columns
        # (cf. the transposed training sets in the snippets below),
        # hence X.T rather than X.
        rbp = PCABinaryProjections('rbp', 10, training_set=X.T)
    else:
        raise ValueError("unknown hash: %r" % hash)
    self.engine = Engine(dimension, lshashes=[rbp])
    for index, x in enumerate(X):
        self.engine.store_vector(x, str(index))
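A minimal query sketch to pair with fit; the `query` helper below is an assumption (not part of the original class) and relies only on NearPy's Engine.neighbours, which returns (vector, data, distance) tuples under the engine's default distance and nearest filter.

def query(self, x):
    # Hypothetical companion to fit() above (not in the original):
    # return the stored string indices of the approximate neighbours
    # of a single point x.
    return [data for _, data, _ in self.engine.neighbours(np.array(x))]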
def get_engine(self, vocab, vecs):
    logging.info('{} hash functions'.format(self.args.projections))
    hashes = [PCABinaryProjections('ne1v', self.args.projections,
                                   vecs[:1000, :].T)]
    engine = Engine(vecs.shape[1], lshashes=hashes, distance=[],
                    vector_filters=[])
    for ind, vec in enumerate(vecs):
        if not ind % 100000:
            logging.info('{} words added to nearpy engine'.format(ind))
        engine.store_vector(vec, ind)
    return engine
def get_nearpy_engine(self):
    """
    Populates the nearpy engine. Note that the instantiation of the PCA
    hash means a PCA of 1000 target vectors and may consume much memory.
    """
    logging.info("Creating nearpy engine...")
    hashes = [PCABinaryProjections("ne1v", self.args.n_proj,
                                   self.tg_model.syn0[:1000, :].T)]
    logging.info(hashes)
    dim = self.tg_model.layer1_size
    self.engine = Engine(dim, lshashes=hashes, vector_filters=[],
                         distance=[])
    for ind in range(self.tg_model.syn0.shape[0]):
        if not ind % 200000:
            logging.debug(
                "{} target words added to nearpy engine".format(ind))
        self.engine.store_vector(self.tg_model.syn0[ind, :], ind)
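For completeness, a hedged sketch of querying the engine built above. Because this engine was created with distance=[] and vector_filters=[], the candidates come back unscored and unfiltered; the sketch assumes each candidate carries the stored index in its second slot, as with NearPy's (vector, data) bucket entries.

# Hypothetical lookup of approximate neighbours for one target vector:
candidates = self.engine.neighbours(self.tg_model.syn0[0, :])
neighbour_inds = [c[1] for c in candidates]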