    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    # Hash the training examples, storing each label as the vector's data so
    # it can be read straight off the neighbour tuples later. CosineDistance
    # is scale-invariant, so the vectors need not be normalized first.
    for train_example, label in zip(X_train, y_train):
        engine.store_vector(train_example, label)

    labels = []
    for test_example in X_test:
        # Each neighbour is a (vector, data, distance) tuple.
        neighbours = engine.neighbours(test_example)
        labels.append([n[1] for n in neighbours])
    return labels
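A minimal call sketch for the classifier above; the random data, sizes, and k are purely illustrative, and numpy plus the NearPy names used by test_nearpy are assumed to be in scope.

import numpy as np

X_train = np.random.randn(100, 20)            # 100 train vectors, 20 dims (hypothetical)
y_train = np.random.randint(0, 3, size=100)   # 3 made-up class labels
X_test = np.random.randn(5, 20)

# One list of (up to) 3 neighbour labels per test example
print(test_nearpy(X_train, y_train, X_test, k=3))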
Example #3
    def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
        self.feature_file = feature_file
        self.dimension = dimension
        self.neighbour = neighbour
        self.face_feature = defaultdict(str)
        self.ground_truth = defaultdict(int)

        # Create permutations meta-hash
        permutations2 = HashPermutationMapper('permut2')

        tmp_feature = defaultdict(str)
        with open(feature_file, 'r') as f:
            reader = csv.reader(f, delimiter=' ')
            for name, feature in reader:
                tmp_feature[name] = feature

        matrix = []
        label = []
        for item in tmp_feature.keys():
            v = [float(x) for x in tmp_feature[item].split(',')]
            matrix.append(np.array(v))
            label.append(item)
        # The matrix is only used to train the PCA hash below, so shuffling
        # it without the labels is harmless here.
        random.shuffle(matrix)
        print('PCA matrix size:', len(matrix))

        rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num,
                                         matrix)
        permutations2.add_child_hash(rbp_perm2)

        # Create engine
        nearest = NearestFilter(self.neighbour)
        self.engine = Engine(self.dimension,
                             lshashes=[permutations2],
                             distance=CosineDistance(),
                             vector_filters=[nearest])
Example #4
	def loadHashmap(self, feature_size=129, result_n=1000):  # these parameters are not used here
		'''
		feature_size: dimensionality of the hash space
		result_n: how many nearest neighbours to return
		'''
		# Create redis storage adapter
		redis_object = Redis(host='localhost', port=6379, db=0)
		redis_storage = RedisStorage(redis_object)
		try:
			# Get hash config from redis
			config = redis_storage.load_hash_configuration('test')
			# Config is existing, create hash with None parameters
			lshash = RandomBinaryProjections(None, None)
			# Apply configuration loaded from redis
			lshash.apply_config(config)
			
		except:
			# Config does not exist; create the hash from scratch with 10 projections
			lshash = RandomBinaryProjections('test', 10)
			

		# Create engine for feature space of 100 dimensions and use our hash.
		# This will set the dimension of the lshash only the first time, not when
		# using the configuration loaded from redis. Use redis storage to store
		# buckets.
		nearest = NearestFilter(result_n)
		#self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
		self.engine = Engine(feature_size, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())

		# Do some stuff like indexing or querying with the engine...
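		# For instance (illustrative only; v and some_data are hypothetical):
		#     self.engine.store_vector(v, some_data)
		#     candidates = self.engine.neighbours(v)  # [(vector, data, distance), ...]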

		# Finally store hash configuration in redis for later use
		redis_storage.store_hash_configuration(lshash)
Example #5
    def __init__(self,
                 dim,
                 lshashes=None,
                 distance=None,
                 fetch_vector_filters=None,
                 vector_filters=None,
                 storage=None):
        """ Keeps the configuration. """
        if lshashes is None:
            lshashes = [RandomBinaryProjections('default', 10)]
        self.lshashes = lshashes
        if distance is None: distance = EuclideanDistance()
        self.distance = distance
        if vector_filters is None: vector_filters = [NearestFilter(10)]
        self.vector_filters = vector_filters
        if fetch_vector_filters is None:
            fetch_vector_filters = [UniqueFilter()]
        self.fetch_vector_filters = fetch_vector_filters
        if storage is None: storage = MemoryStorage()
        self.storage = storage

        # Initialize all hashes for the data space dimension.
        for lshash in self.lshashes:
            lshash.reset(dim)

        print('*** engine init done ***')
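A minimal round trip through an engine built with these defaults; a sketch assuming this __init__ belongs to NearPy's Engine and that numpy is available.

import numpy

engine = Engine(10)                    # 10-dim space, default hash, filters, storage
v = numpy.random.randn(10)
engine.store_vector(v, 'some data')
for vector, data, distance in engine.neighbours(v):
    print(data, distance)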
Example #6
    def load_hashmap(self):
        # Create redis storage adapter
        # need to start redis service
        redis_object = Redis(host='localhost', port=6379, db=14)
        redis_storage = RedisStorage(redis_object)
        try:
            config = redis_storage.load_hash_configuration('test')
            lshash = RandomBinaryProjections(None, None)
            lshash.apply_config(config)

        except:
            # Config is not existing, create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 10)

        nearest = NearestFilter(self.nn)
        # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        self.engine = Engine(self.feature_size,
                             lshashes=[lshash],
                             vector_filters=[nearest],
                             storage=redis_storage,
                             distance=CosineDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)
Example #7
    def test_random_discretized_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # First get recall and precision for one 1-dim random hash
        rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
            (recall1, precision1, searchtime1))

        # Then get recall and precision for one 2-dim random hash
        rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        result = exp.perform_experiment([engine])

        recall2 = result[0][0]
        precision2 = result[0][1]
        searchtime2 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' % \
            (recall2, precision2, searchtime2))

        # Many things are random here, but the precision should increase
        # with dimension
        self.assertTrue(precision2 > precision1)
Example #8
    def __init__(self, metric, n_bits, hash_counts):
        self._n_bits = n_bits
        self._hash_counts = hash_counts
        self._metric = metric
        self._filter = NearestFilter(10)
        self.name = 'NearPy(n_bits=%d, hash_counts=%d)' % (self._n_bits,
                                                           self._hash_counts)
Example #9
def knn(data, k):
    assert k <= len(
        data
    ) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension,
                    lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):
        N = engine.neighbours(data[i])
        # Drop the first hit, which is the query point itself.
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    # Note: N is only the raw neighbour list of the last query.
    return N, dist, ind
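A quick call sketch for knn above; the data shape and k are illustrative, and numpy is assumed to be imported as np.

import numpy as np

data = np.random.randn(50, 8)    # 50 points in 8 dimensions (hypothetical)
_, dist, ind = knn(data, k=3)
print(ind[0])                    # indices of point 0's 3 approximate neighbours
print(dist[0])                   # the corresponding distances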
Example #10
    def test_retrieval(self):
        # We want 12 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

        # Create engine for 100 dimensional feature space, do not forget to set
        # nearest filter to 20, because default is 10
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 200000 random vectors
        #print 'Indexing...'
        for k in range(200000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        # Now do random queries and check result set size
        #print 'Querying...'
        for k in range(10):
            x = numpy.random.randn(100)
            n = self.engine.neighbours(x)
            #print "Candidate count = %d" % self.engine.candidate_count(x)
            #print "Result size = %d" % len(n)
            self.assertEqual(len(n), 20)
Example #11
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the ten closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in range(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in range(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, num))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            reti = [y for x, y, z in engine.neighbours(data.query[i])]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #12
def createLSH(dimensions):
    nearest = NearestFilter(5)
    bin_width = 10
    projections = 50
    rbp = RandomDiscretizedProjections('rbp', projections, bin_width)
    rbp2 = RandomDiscretizedProjections('rbp2', projections, bin_width)
    rbp3 = RandomDiscretizedProjections('rbp3', projections, bin_width)
    rbp4 = RandomDiscretizedProjections('rbp4', projections, bin_width)

    engine = Engine(dimensions, lshashes=[rbp, rbp2, rbp3, rbp4], vector_filters=[nearest])
    return engine
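Sketch of using createLSH; the dimensionality and data are illustrative, with numpy assumed to be imported as np.

import numpy as np

engine = createLSH(300)
for i in range(1000):
    engine.store_vector(np.random.randn(300), i)

# At most 5 (vector, data, distance) tuples, per the NearestFilter(5) above
print(engine.neighbours(np.random.randn(300)))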
Example #13
    def _create_engine(self, k, lshashes=None):
        self.k_ = k
        self.engine_ = Engine(self.dimension_,
                              lshashes,
                              distance=self.dist_metric_,
                              vector_filters=[NearestFilter(k)])

        for i, feature in enumerate(self.featurized_):
            if self.transpose_:
                self.engine_.store_vector(feature.T, i)
            else:
                self.engine_.store_vector(feature, i)
Example #14
    def build(self, data, k, cp):
        n_items, vector_length = data.shape
        #print(data.shape)
        #parameters init
        method_param = init_method_param("nearpy", data=data, cp=cp)
        hash_counts = method_param["hash_counts"]
        n_bits = method_param["n_bits"]

        self.filter = NearestFilter(10)

        hashes = []
        # The loop variable must not shadow the parameter k, which is still
        # needed below to size the result arrays.
        for h in range(hash_counts):
            nearpy_rbp = nearpy.hashes.RandomBinaryProjections(
                'rbp_%d' % h, n_bits)
            hashes.append(nearpy_rbp)

        if self.metric == 'euclidean':
            dist = nearpy.distances.EuclideanDistance()
            self.index = nearpy.Engine(
                vector_length,
                lshashes=hashes,
                distance=dist,
                vector_filters=[self.filter])
        else:  # Default (angular) = Cosine distance
            self.index = nearpy.Engine(
                vector_length,
                lshashes=hashes,
                vector_filters=[self.filter])

        #if self.metric == 'angular':
            #data = sklearn.preprocessing.normalize(data, axis=1, norm='l2')
        for i, x in enumerate(data):
            self.index.store_vector(x, i)

        # def query_train(self, data, k):
        self.filter.N = k
        #if self.metric == 'angular':
            #data = sklearn.preprocessing.normalize([data], axis=1, norm='l2')[0]

        neighbors = np.empty((data.shape[0], k), dtype=int)
        distances = np.empty((data.shape[0], k))

        for i in range(len(data)):
            item_single = self.index.neighbours(data[i])
            dp_n = []
            dp_d = []
            for j in range(len(item_single)):
                dp_n.append(item_single[j][1])
                dp_d.append(item_single[j][2])
            neighbors[i] = np.asarray(dp_n)
            distances[i] = np.asarray(dp_d)

        return neighbors, distances
Example #15
    def test_experiment_with_unibucket_1(self):
        dim = 50
        vector_count = 100
        vectors = numpy.random.randn(dim, vector_count)
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        # Both recall and precision must be one in this case
        self.assertEqual(result[0][0], 1.0)
        self.assertEqual(result[0][1], 1.0)
Example #16
    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()
Example #17
    def test_experiment_with_unibucket_3(self):
        dim = 50
        vector_count = 100
        vectors = numpy.random.randn(dim, vector_count)
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(5)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        # In this case recall is only 0.5
        # because the engine returns 5 nearest, but
        # the experiment looks for 10 nearest.
        self.assertEqual(result[0][0], 0.5)
        self.assertEqual(result[0][1], 1.0)
Example #18
    def __init__(self,
                 dim,
                 lshashes=[RandomBinaryProjections('default', 10)],
                 distance=EuclideanDistance(),
                 vector_filters=[NearestFilter(10)],
                 storage=MemoryStorage()):
        """ Keeps the configuration. """
        self.lshashes = lshashes
        self.distance = distance
        self.vector_filters = vector_filters
        self.storage = storage

        # Initialize all hashes for the data space dimension.
        for lshash in self.lshashes:
            lshash.reset(dim)
Example #19
    def test_experiment_with_list_2(self):
        dim = 50
        vector_count = 100
        vectors = []
        for index in range(vector_count):
            vectors.append(numpy.random.randn(dim))
        unibucket = UniBucket('testHash')
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[unibucket], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(5, vectors)
        result = exp.perform_experiment([engine])

        # In this case precision is only 0.5
        # because the engine returns 10 nearest, but
        # the experiment only looks for 5 nearest.
        self.assertEqual(result[0][0], 1.0)
        self.assertEqual(result[0][1], 0.5)
Example #20
def lshSearch(dataBase_, query_, k):
    featureNum_ = len(dataBase_)
    dimension_ = len(dataBase_[0])

    rbp_ = RandomBinaryProjections('rbp', 30)

    engine_ = Engine(dimension_,
                     lshashes=[rbp_],
                     vector_filters=[NearestFilter(k)])

    for i in range(featureNum_):
        v_ = dataBase_[i]
        engine_.store_vector(v_, '{}'.format(i))

    N_ = engine_.neighbours(query_, distance='euclidean')
    index_ = [int(x[1]) for x in N_]
    return index_
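A call sketch for lshSearch; the sizes are illustrative, and numpy is assumed to be imported as np.

import numpy as np

dataBase = np.random.randn(1000, 64)     # 1000 vectors of dimension 64 (hypothetical)
query = np.random.randn(64)
top10 = lshSearch(dataBase, query, 10)   # indices of the ~10 nearest vectors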
Example #21
    def test_random_binary_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # First get recall and precision for a single 32-bit random binary hash
        rbp = RandomBinaryProjections('rbp', 32)
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[rbp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RBP: %f, Precision RBP: %f, SearchTime RBP: %f\n' % \
            (recall1, precision1, searchtime1))
Example #22
class TestVectorFilters(unittest.TestCase):

    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        result = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 3)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)

    def test_nearest(self):
        result = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 5)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)
        self.assertIn(self.V[2], result)
        self.assertIn(self.V[3], result)

    def test_unique(self):
        W = self.V
        W.append((numpy.array([7]), 'data8', 2.8))
        W.append((numpy.array([0]), 'data1', 2.8))
        W.append((numpy.array([1]), 'data2', 2.8))
        W.append((numpy.array([6]), 'data7', 2.8))

        result = self.unique.filter_vectors(W)
        self.assertEqual(len(result), 8)
Example #23
class TestVectorFilters(unittest.TestCase):

    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        result = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 3)
        self.assertTrue(self.V[0] in result)
        self.assertTrue(self.V[1] in result)
        self.assertTrue(self.V[4] in result)

    def test_nearest(self):
        result = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 5)
        self.assertTrue(self.V[0] in result)
        self.assertTrue(self.V[1] in result)
        self.assertTrue(self.V[4] in result)
        self.assertTrue(self.V[2] in result)
        self.assertTrue(self.V[3] in result)

    def test_unique(self):
        W = self.V
        W.append((numpy.array([7]), 'data8', 2.8))
        W.append((numpy.array([0]), 'data1', 2.8))
        W.append((numpy.array([1]), 'data2', 2.8))
        W.append((numpy.array([6]), 'data7', 2.8))

        result = self.unique.filter_vectors(W)
        self.assertEqual(len(result), 8)
Example #24
    def test_storage_redis(self):
        # We want 10 projections, 20 results at least
        rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

        # Create engine for 100 dimensional feature space
        self.engine = Engine(100,
                             lshashes=[rbpt],
                             vector_filters=[NearestFilter(20)])

        # First insert 2000 random vectors
        for k in range(2000):
            x = numpy.random.randn(100)
            x_data = 'data'
            self.engine.store_vector(x, x_data)

        self.redis_storage.store_hash_configuration(rbpt)

        rbpt2 = RandomBinaryProjectionTree(None, None, None)
        rbpt2.apply_config(
            self.redis_storage.load_hash_configuration('testHash'))

        self.assertEqual(rbpt.dim, rbpt2.dim)
        self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
        self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

        for i in range(rbpt.normals.shape[0]):
            for j in range(rbpt.normals.shape[1]):
                self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

        # Now do random queries and check result set size
        for k in range(10):
            x = numpy.random.randn(100)
            keys1 = rbpt.hash_vector(x, querying=True)
            keys2 = rbpt2.hash_vector(x, querying=True)
            self.assertEqual(len(keys1), len(keys2))
            for j in range(len(keys1)):
                self.assertEqual(keys1[j], keys2[j])
Example #25
    def __init__(self,
                 matrix,
                 max_neighbours=20,
                 lshashes=[RandomBinaryProjections("rbp", 10)],
                 vector_filters=[UniqueFilter()],
                 distance=Pearson()):

        if not isinstance(lshashes, list):
            raise TypeError("'lshashes' must be an instance of 'list'")

        if not isinstance(vector_filters, list):
            raise TypeError("'vector_filters' must be an instance of 'list'")

        self.underlying = Engine(len(matrix[0]),
                                 lshashes=lshashes,
                                 vector_filters=vector_filters +
                                 [NearestFilter(max_neighbours)],
                                 distance=distance)

        for vector in matrix:
            self.underlying.store_vector(vector)
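The enclosing class is not shown in this snippet; assuming a hypothetical name VectorIndex for it, construction and a query might look like the following (numpy assumed).

import numpy as np

matrix = np.random.randn(50, 16)               # 50 vectors of dimension 16 (hypothetical)
index = VectorIndex(matrix)                    # hypothetical class name
print(index.underlying.neighbours(matrix[0]))  # (vector, data, distance) tuples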
Example #26
print('MAE:', sum(err) / len(err))
print('RMSE', sqrt(sum([num**2 for num in err]) / len(err)))
end_time = time()
print('elapsed time:', end_time - begin_time)

print('*' * 50)

# Regression via LSH search
print('LSH search scheme:')
# Use three distinct hash instances with distinct names; storing the same
# instance three times would just hash every vector into the same buckets.
rbp1 = RandomBinaryProjections('rbp1', 20)
rbp2 = RandomBinaryProjections('rbp2', 20)
rbp3 = RandomBinaryProjections('rbp3', 20)
engine1 = Engine(dimension - 1,
                 lshashes=[rbp1, rbp2, rbp3],
                 storage=MemoryStorage(),
                 distance=EuclideanDistance(),
                 vector_filters=[NearestFilter(100)])

engine1.store_many_vectors(dataBase, [i for i in range(featureNum)])

begin_time = time()
print('        prediction      error')
err = []
for m in range(len(queryBase)):
    query = queryBase[m]
    N = engine1.neighbours(query,
                           distance='euclidean',
                           fetch_vector_filters=[UniqueFilter()])
    index = [int(x[1]) for x in N]
    # print(index)
    data = np.array([dataBaseInitial.iloc[index, :]])
    data = data[0]
Example #27
    dimensions = 300

filename = os.getenv('VECTORS_FILE', 'glove.6B/glove.6B.300d.small.txt')

print('loading vectors')

lines = open(filename).read().strip().split('\n')

word_vectors = {}
for line in lines:
    split_line = line.split()
    word = split_line[0]
    vec = array([float(thing) for thing in split_line[1:]])
    word_vectors[word] = vec

print('starting engine')

nearest = Engine(dimensions,
                 distance=ManhattanDistance(),
                 vector_filters=[NearestFilter(20)],
                 lshashes=[RandomBinaryProjections('rbp', 2, rand_seed=42)])

for word, vec in word_vectors.items():
    nearest.store_vector(vec, word)

print('ready')


def query(array):
    return [res[1] for res in nearest.neighbours(array)]
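An illustrative lookup against the engine above; whether a particular word is present depends on the vectors file, so 'computer' here is an assumption.

print(query(word_vectors['computer']))   # up to 20 nearest words by Manhattan distance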
Example #28
    def train(self, config):
        train_dataset = ImageDataset(config['dataset_path'], 'seen',
                                     config['data_augmentation_suffixes'],
                                     config['allow_different_views'])
        train_dataset.prepare(config['num_train_pairs'])

        val_dataset = ImageDataset(config['dataset_path'], 'test')
        val_dataset.prepare(config['num_val_pairs'])

        train_generator = DataGenerator(
            train_dataset,
            batch_size=config['batch_size'],
            dim=self.config['input_shape'],
            shuffle=config['shuffle_training_inputs'],
            dataset_type=config['dataset_type'])
        val_generator = DataGenerator(
            val_dataset,
            batch_size=config['batch_size'],
            dim=self.config['input_shape'],
            shuffle=config['shuffle_training_inputs'],
            dataset_type=config['dataset_type'])

        model_path, _ = os.path.split(self.config['model_filename'])
        callbacks = [
            keras.callbacks.TensorBoard(log_dir=self.log_dir,
                                        histogram_freq=0,
                                        write_graph=True,
                                        write_images=False),
            keras.callbacks.ModelCheckpoint(self.checkpoint_path,
                                            verbose=0,
                                            save_weights_only=True)
        ]

        self.keras_model.compile(
            loss=utils.contrastive_loss,
            optimizer=Adam(lr=config['learning_rate']),
            metrics=[utils.accuracy, utils.auc_roc, 'acc'])

        history = self.keras_model.fit_generator(
            generator=train_generator,
            validation_data=val_generator,
            epochs=config['epochs'],
            use_multiprocessing=True,
            callbacks=callbacks,
            workers=multiprocessing.cpu_count())

        self.keras_model.save(self.config['model_filename'])
        #-------------------------------------------------------

        #make new dataset
        seen_dataset = ImageDataset(config['dataset_path'], 'seen')
        seen = []

        #only add imgs that are returned by nearpy

        new = []
        for i in range(5):
            new += [
                os.path.join(x, 'view_00000{}'.format(i)) for x in os.listdir(
                    os.path.join(config['dataset_path'], 'test'))
            ]
        pred_arr = []
        dimension = 9984
        engine = Engine(dimension, vector_filters=[NearestFilter(5)])
        #for i in range(0, iter):
        #seen = list(seen_dataset._class_labels)
        seen = []
        for i in range(5):
            seen += [
                os.path.join(x, 'view_00000{}'.format(i)) for x in os.listdir(
                    os.path.join(config['dataset_path'], 'seen'))
            ]
        for class1 in seen:
            folder1 = os.path.join(
                os.path.join(config['dataset_path'], 'seen'), class1)
            for obj in os.listdir(folder1):
                im2 = os.path.join(folder1, obj)
                image = np.load(im2)
                engine.store_vector(image['arr_0'], class1)
        for img in new:
            # nearpy lookup
            folder1 = os.path.join(
                os.path.join(config['dataset_path'], 'test'), img)
            im = os.path.join(folder1, os.listdir(folder1)[0])
            image = np.load(im)['arr_0']
            neighbors = engine.neighbours(image)
            for n in neighbors:
                folder1 = os.path.join(
                    os.path.join(config['dataset_path'], 'seen'), n[1])
                im = os.path.join(folder1,
                                  os.listdir(folder1)[0])  #make random
                neighbor = np.load(im)['arr_0']
                prediction = self.predict(
                    [np.array([image]),
                     np.array([neighbor])], 1)
                pred_arr += [[img[:-12], n[1][:-12], prediction]]

        for item in pred_arr:
            val_dataset.prepare_specific(1, item[0], item[1])
            with open("ground.txt", "a") as f:
                if item[0] == item[1]:
                    f.write('1 {} {} {} '.format(item[0], item[1], item[2]))
                else:
                    f.write('0 {} {} {} '.format(item[0], item[1], item[2]))
Example #29
def worker(A, start):
    # starttime=datetime.datetime.now()
    # endtime=datetime.datetime.now()
    # timee=endtime-starttime
    timee = 0
    num = 0
    for j in range(kkkk):
        k1 = 0
        #for circ in range(1000):
        # starttime=datetime.datetime.now()
        #lshtruple=lsh.query(newcomparearrt[j+start*kkkk],1)
        #print type(engine)
        lshtruple = engine.neighbours(newcomparearrt[j + start * kkkk])
        #print lshtruple
        # endtime=datetime.datetime.now()
        # timee=timee+(endtime-starttime).seconds
        # if lshtruple:
        #     print lshtruple[0]

        for f in range(len(CC)):
            #print CC[f]
            if lshtruple:
                # tuple.__eq__ against a numpy array returns NotImplemented,
                # which is truthy, so compare plain tuples instead.
                if tuple(CC[f]) == tuple(lshtruple[0][0]):
                    k1 = f
                    break

        #print k1
        length3 = len(clusresult[k1])
        temp = clusresult[k1]
        #.....................................................................................
        # lsh1=LSHash(6,3)
        # #print temp
        # ff=0
        # for ff in xrange(length3):
        #     lsh1.index(temp[ff])
        # starttime1=datetime.datetime.now()
        # if lsh1.query(newcomparearrt[j],1):
        #     num=num+1
        # endtime1=datetime.datetime.now()
        # timee=timee+(endtime1-starttime1).seconds
        # del lsh1
        #.....................................................................................
        #nearpy
        rbp1 = RandomBinaryProjections('rbp2', 10)
        DIM1 = 3
        engine1 = Engine(DIM1,
                         lshashes=[rbp1],
                         distance=CosineDistance(),
                         vector_filters=[NearestFilter(1)])
        for ff in range(length3):
            engine1.store_vector(temp[ff], ff)
        if engine1.candidate_count(newcomparearrt[j]):
            num = num + 1
        #print num
        #del engine
        results = engine1.neighbours(newcomparearrt[j])
        #  print results
        del engine1

        #...........................................................................
    A.append(num)
Example #30
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))


  reader = codecs.getreader('utf8')
  writer = codecs.getwriter('utf8')
  infile = reader(args.infile)
  dictionaries = [reader(d) for d in args.dictionaries]
  outfile = writer(args.outfile)
  
  # make nn indices for each language
  # Create a random binary hash
  rbp = RandomBinaryProjections('rbp', args.bits)
  # create engines for each language
  engines = [Engine(args.dim, lshashes=[rbp], distance=CosineDistance(), vector_filters=[NearestFilter(args.nbest)]) for x in range(args.langs)]
  
  # load transformation matrices
  mats = np.load(args.modelfile)
  invmats = {}
  for name, mat in mats.items():
    invmats[name]=LA.inv(mat)
  vocabs = [np.loadtxt(dictionary, dtype=str) for dictionary in dictionaries]
  vocab = dd(lambda: dict())
  for entry in np.vstack(vocabs):
    word = entry[0]
    lang = entry[1]
    vec = entry[2:].astype(float)
    if word not in vocab[lang]:
      vocab[lang][word]=vec
      engines[int(lang)].store_vector(vec, word)
Example #31
dimension = 1000

# Create permutations meta-hash
permutations2 = HashPermutationMapper('permut2')

# Create binary hash as child hash
rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

# Add rbp as child hash of permutations hash
permutations2.add_child_hash(rbp_perm2)

engine = Engine(dimension,
                lshashes=[permutations2],
                distance=CosineDistance(),
                vector_filters=[NearestFilter(5)],
                storage=MemoryStorage())

i = 0

query = numpy.zeros(dimension)

f = open('features2.txt', 'r')
# Opening, reading from the file:
for next_read_line in f:
    next_read_line = next_read_line.rstrip()

    split_arr = next_read_line.split(" ")
    split_arr = split_arr[1:]
    split_arr = list(map(float, split_arr))
Example #32
# Create data set from two clusters
vectors = []

center = numpy.random.randn(dimension)
for index in range(vector_count // 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)

center = numpy.random.randn(dimension)
for index in range(vector_count // 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)

# We are looking for the N closest neighbours
N = 20
nearest = NearestFilter(N)

# We will fill this array with all the engines we want to test
engines = []

print('Creating engines...')

# We are going to test these bin widths
bin_widths = [0.01 * x for x in range(1, 5)]
# Create engines for all configurations
for bin_width in bin_widths:
    # Use four random 1-dim discretized projections
    rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width)
    rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width)
    rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width)
    rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width)