def setUp(self):
    """Prepare the candidate list and the three filters used by the tests.

    ``self.V`` holds eight ``(vector, data, distance)`` tuples. The vector
    of entry ``k`` is the one-element array ``[k]`` and its data string is
    ``'data<k + 1>'``; only the distances are hand-picked.
    """
    distances = (0.4, 0.9, 1.4, 2.1, 0.1, 8.7, 3.4, 2.8)
    self.V = [
        (numpy.array([idx]), 'data%d' % (idx + 1), dist)
        for idx, dist in enumerate(distances)
    ]

    self.threshold_filter = DistanceThresholdFilter(1.0)
    self.nearest_filter = NearestFilter(5)
    self.unique = UniqueFilter()
def __init__(self, dim, lshashes=None, distance=None,
             fetch_vector_filters=None, vector_filters=None, storage=None):
    """Store the engine configuration, substituting defaults where needed.

    Every optional argument left as ``None`` is replaced by a freshly
    created default:

    * ``lshashes``: ``[RandomBinaryProjections('default', 10)]``
    * ``distance``: ``EuclideanDistance()``
    * ``vector_filters``: ``[NearestFilter(10)]``
    * ``fetch_vector_filters``: ``[UniqueFilter()]``
    * ``storage``: ``MemoryStorage()``

    Afterwards each hash in ``lshashes`` is reset for dimension ``dim``.
    """
    # Explicit ``is None`` tests: an empty list passed by the caller is a
    # valid configuration and must not be swapped for the default.
    self.lshashes = (
        [RandomBinaryProjections('default', 10)]
        if lshashes is None else lshashes
    )
    self.distance = EuclideanDistance() if distance is None else distance
    self.vector_filters = (
        [NearestFilter(10)] if vector_filters is None else vector_filters
    )
    self.fetch_vector_filters = (
        [UniqueFilter()]
        if fetch_vector_filters is None else fetch_vector_filters
    )
    self.storage = MemoryStorage() if storage is None else storage

    # Prepare every hash for vectors of the requested dimension.
    for hash_fn in self.lshashes:
        hash_fn.reset(dim)

    print('*** engine init done ***')
def main(args):
    """Main entry: evaluate NearPy engines for every (nbit, ntbl) pair.

    For each combination drawn from ``args.nbits`` and ``args.ntbls`` this
    builds an engine with ``ntbl`` random binary projection hashes of
    ``nbit`` bits, stores every base vector under its row index, runs all
    queries, and appends the recall figure and the per-query time cost to
    ``report.txt`` inside ``args.exp_dir``.

    Args:
        args: Parsed options providing ``dataset``, ``topk``, ``nbits``,
            ``ntbls`` and ``exp_dir``.
    """
    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # Result lists are limited by NearestFilter(args.topk) ...
    nearest = NearestFilter(args.topk)
    # ... after passing through the unique-candidate filter.
    unique = UniqueFilter()

    # The report path is the same for every configuration; build it once.
    report = os.path.join(args.exp_dir, "report.txt")

    # NOTE: ``xrange`` is kept on purpose, this script is written for
    # Python 2.
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                # ``num`` is the row count of data.base. The previous
                # ``data.nbae`` was a misspelled attribute and, because
                # the message was formatted eagerly, failed at i == 0.
                logging.info("\t%d/%d", i, num)
        logging.info("\tDone!")

        # One row of ids per query; rows with fewer than topk results keep
        # their zero padding. Builtin ``int`` replaces the removed
        # ``np.int`` alias (they were the same type).
        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            # Each neighbour is a 3-tuple whose middle element is the value
            # passed to store_vector above (the row index). Iterate the
            # result directly: wrapping the ragged tuples in np.array is
            # unnecessary and rejected by recent NumPy releases.
            reti = [item_id
                    for _, item_id, _ in engine.neighbours(data.query[i])]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d", i, data.nqry)
        time_costs = toc()
        logging.info("\tDone!")

        # Timestamp banner is written before the statistics are computed,
        # so it appears even if compute_stats raises.
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
class TestVectorFilters(unittest.TestCase):
    """Checks for the distance-threshold, nearest and unique filters."""

    def setUp(self):
        """Create eight (vector, data, distance) candidates and the filters."""
        distances = [0.4, 0.9, 1.4, 2.1, 0.1, 8.7, 3.4, 2.8]
        self.V = []
        for idx, dist in enumerate(distances):
            # Vector k is the one-element array [k]; its label is data<k+1>.
            self.V.append((numpy.array([idx]), 'data%d' % (idx + 1), dist))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        """The threshold filter keeps exactly the entries 0, 1 and 4."""
        kept = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(kept), 3)
        # These are the candidates with distances 0.4, 0.9 and 0.1.
        for position in (0, 1, 4):
            self.assertIn(self.V[position], kept)

    def test_nearest(self):
        """The nearest filter keeps exactly the entries 0, 1, 4, 2 and 3."""
        kept = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(kept), 5)
        # These are the candidates with the five smallest distances.
        for position in (0, 1, 4, 2, 3):
            self.assertIn(self.V[position], kept)

    def test_unique(self):
        """Entries repeating an existing data label are dropped."""
        candidates = self.V
        candidates.extend([
            (numpy.array([7]), 'data8', 2.8),
            (numpy.array([0]), 'data1', 2.8),
            (numpy.array([1]), 'data2', 2.8),
            (numpy.array([6]), 'data7', 2.8),
        ])
        deduped = self.unique.filter_vectors(candidates)
        self.assertEqual(len(deduped), 8)
class TestVectorFilters(unittest.TestCase):
    """Tests for DistanceThresholdFilter, NearestFilter and UniqueFilter."""

    def setUp(self):
        """Build eight (vector, data, distance) candidates and the filters."""
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        """Only the candidates with distances 0.4, 0.9 and 0.1 survive."""
        result = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 3)
        # assertIn reports the missing member and the container on failure,
        # whereas assertTrue(x in y) only says "False is not true".
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)

    def test_nearest(self):
        """The five candidates with the smallest distances survive."""
        result = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 5)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)
        self.assertIn(self.V[2], result)
        self.assertIn(self.V[3], result)

    def test_unique(self):
        """Four entries repeating existing data labels are filtered out."""
        # Work on a new list so the fixture in self.V is left untouched.
        W = self.V + [
            (numpy.array([7]), 'data8', 2.8),
            (numpy.array([0]), 'data1', 2.8),
            (numpy.array([1]), 'data2', 2.8),
            (numpy.array([6]), 'data7', 2.8),
        ]
        result = self.unique.filter_vectors(W)
        self.assertEqual(len(result), 8)
def __init__(self, matrix, max_neighbours=20, lshashes=None,
             vector_filters=None, distance=None):
    """Index every row of ``matrix`` in a freshly built engine.

    Args:
        matrix: Non-empty sequence of vectors; the length of the first
            row is used as the engine dimension.
        max_neighbours: Argument for the ``NearestFilter`` appended after
            ``vector_filters``.
        lshashes: List of hashes. ``None`` (the default) creates
            ``[RandomBinaryProjections("rbp", 10)]``.
        vector_filters: List of filters. ``None`` (the default) creates
            ``[UniqueFilter()]``.
        distance: Distance implementation. ``None`` (the default) creates
            ``Pearson()``.

    Raises:
        TypeError: If ``lshashes`` or ``vector_filters`` is given and is
            not a ``list``.
    """
    # The defaults are created per call rather than in the signature.
    # Objects placed in the signature are evaluated once at definition
    # time, so every instance relying on the defaults would share the
    # same hash, filter and distance objects.
    if lshashes is None:
        lshashes = [RandomBinaryProjections("rbp", 10)]
    elif not isinstance(lshashes, list):
        raise TypeError("'lshashes' must be an instance of 'list'")

    if vector_filters is None:
        vector_filters = [UniqueFilter()]
    elif not isinstance(vector_filters, list):
        raise TypeError("'vector_filters' must be an instance of 'list'")

    if distance is None:
        distance = Pearson()

    # ``+`` builds a new list, so the caller's filter list is not modified.
    self.underlying = Engine(
        len(matrix[0]),
        lshashes=lshashes,
        vector_filters=vector_filters + [NearestFilter(max_neighbours)],
        distance=distance,
    )

    for vector in matrix:
        self.underlying.store_vector(vector)
# Script fragment: index dataBase in an LSH engine, then for every query
# fetch its neighbours and fit a Gaussian process regressor on them.
# `dimension`, `rbp`, `dataBase`, `dataBaseInitial`, `featureNum` and
# `queryBase` are defined outside this fragment.

# NOTE(review): the same `rbp` object is listed three times in lshashes;
# confirm this is intended rather than three distinct hashes.
engine1 = Engine(dimension - 1, lshashes=[rbp, rbp, rbp], storage=MemoryStorage(), distance=EuclideanDistance(), vector_filters=[NearestFilter(100)])
# Each vector is stored under its position 0 .. featureNum - 1.
engine1.store_many_vectors(dataBase, [i for i in range(featureNum)])
begin_time = time()
# Column header for the output; the text reads "predicted value" and "error".
print(' 预测值 误差')
# Not appended to within the visible part of the script.
err = []
for m in range(len(queryBase)):
    query = queryBase[m]
    # NOTE(review): `distance` is given as a string here although the engine
    # was constructed with an EuclideanDistance() object; confirm that
    # neighbours() accepts a name.
    N = engine1.neighbours(query, distance='euclidean', fetch_vector_filters=[UniqueFilter()])
    # Element 1 of every neighbour tuple is used as a row position below.
    index = [int(x[1]) for x in N]
    # print(index)
    # Select the neighbour rows, wrap them in an extra axis and strip it again.
    data = np.array([dataBaseInitial.iloc[index, :]])
    data = data[0]
    # print(data.shape)
    # Re-wrap the query in an outer list so predict() receives a single row.
    query = np.array([queryBase[m]])
    # print(query.shape)
    # Gaussian process regression
    # NOTE(review): C and RBF are presumably scikit-learn kernels
    # (ConstantKernel imported as C); confirm against the imports.
    kernel = C(0.1, (0.001, 0.1)) * RBF(0.5, (1e-4, 10))
    reg = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=0.01)
    # Column 0 of the neighbour rows is the target; the remaining columns
    # are the features.
    reg.fit(data[:, 1:], data[:, 0])
    output = reg.predict(query)