Beispiel #1
0
 def setUpClass(self):
     """Build a random HNSW index once and save it to ``self.model_fname``.

     Adds ``self.data_num`` vectors, each made of ``self.dim`` samples drawn
     from ``random.gauss(0, 1)``, to a fresh ``HnswIndex``; the index is then
     built with 12 threads and written to disk.
     """
     index = HnswIndex(self.dim)
     # ``range`` instead of ``xrange``: the latter does not exist on Python 3,
     # while ``range`` works on both major versions.
     for _ in range(self.data_num):
         v = [random.gauss(0, 1) for _ in range(self.dim)]
         index.add_data(v)
     index.build(n_threads=12)
     index.save(self.model_fname)
Beispiel #2
0
    def test01_most_similar(self):
        """Check that parallel and ANN ``most_similar`` beat the naive loop.

        Times three ways of querying the same keys (at most 10000 item ids):
        one ``model.most_similar`` call per key, a batched ``ParW2V`` call
        with 4 workers, and the same batched call after attaching an HNSW
        index. Asserts
        ``naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0``.
        """
        set_log_level(1)
        model = self.load_text8_model()
        index = HnswIndex(model.L0.shape[1])
        model.normalize('item')
        try:
            for f in model.L0:
                index.add_data(f)
            index.build(n_threads=4)
            index.save('n2.bin')

            par = ParW2V(model)

            model.opt.num_workers = 1
            all_keys = model._idmanager.itemids[::][:10000]
            start_t = time.time()
            # Plain loop: the results are discarded, only the timing matters.
            for k in all_keys:
                model.most_similar(k, topk=10)
            naive_elapsed = time.time() - start_t

            par.num_workers = 4
            start_t = time.time()
            par.most_similar(all_keys, topk=10, repr=True)
            par_elapsed = time.time() - start_t

            # The ANN timing deliberately includes attaching the index file.
            start_t = time.time()
            par.set_hnsw_index('n2.bin', 'item')
            par.most_similar(all_keys, topk=10, repr=True)
            ann_elapsed = time.time() - start_t
            self.assertTrue(naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0,
                            msg=f'{naive_elapsed} > {par_elapsed} > {ann_elapsed}')
        finally:
            # Always release the index and delete the temp file, even when
            # the assertion (or any earlier step) fails.
            index.unload()
            if os.path.exists('n2.bin'):
                os.remove('n2.bin')
Beispiel #3
0
def _write_top10_tsv(par, all_items, path, batch_size=128):
    """Write ``par.most_similar`` results for ``all_items`` to ``path`` as TSV.

    Items are queried in slices of ``batch_size``. Each output line holds the
    query item followed by its tab-separated similar items.
    """
    with open(path, 'w') as fout:
        for idx in range(0, len(all_items), batch_size):
            batch = all_items[idx:idx + batch_size]
            topks, _ = par.most_similar(batch, repr=True)
            for q, p in zip(batch, topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))


def example2():
    """Train ALS and dump similar items via parallel and ANN search.

    Trains an ``ALS`` model on the MatrixMarket input under
    ``../tests/ext/ml-20m``, then writes similar-item lists for every item
    twice: first with ``ParALS`` alone (``als.ml20m.par.top10.tsv``), then
    with an n2 HNSW index attached (``als.ml20m.ann.top10.tsv``). The elapsed
    time of each pass is printed.
    """
    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Paralell(Thread=4)'
    )
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    start_t = time.time()
    _write_top10_tsv(par, all_items, 'als.ml20m.par.top10.tsv')
    print('took: %.3f secs' % (time.time() - start_t))

    # Build the HNSW index from the item factors and save it so that the
    # parallel wrapper can load it by path.
    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()
    # The message now names the file this pass actually writes (the original
    # text pointed at the ``par`` output file).
    # NOTE(review): the message says Thread=1 while num_workers is set to 4
    # below -- confirm which one the ANN path really uses.
    print(
        'Make item recommendation on als.ml20m.ann.top10.tsv with Ann(Thread=1)'
    )
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    start_t = time.time()
    _write_top10_tsv(par, all_items, 'als.ml20m.ann.top10.tsv')
    print('took: %.3f secs' % (time.time() - start_t))
Beispiel #4
0
class N2(BaseANN):
    """Benchmark wrapper around n2's ``HnswIndex``.

    A built index is cached on disk under ``INDEX_DIR``; ``fit`` loads that
    file when it already exists instead of rebuilding.
    """

    def __init__(self, m, ef_construction, n_threads, ef_search, metric):
        """Store the build/search parameters and ensure the index dir exists."""
        build_params = (m, ef_construction, n_threads)

        self._metric = metric
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(
            INDEX_DIR, "youtube_n2_M%d_efCon%d_n_thread%s" % build_params)
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d" % (
            build_params + (ef_search,))

        index_dir = os.path.dirname(self._index_name)
        if not os.path.exists(index_dir):
            os.makedirs(index_dir)

    def fit(self, X):
        """Load the cached index if present, otherwise build it from ``X`` and save it."""
        from n2 import HnswIndex

        # 'euclidean' maps to n2's 'L2'; any other metric uses the
        # constructor's default.
        metric_args = ('L2',) if self._metric == 'euclidean' else ()
        self._n2 = HnswIndex(X.shape[1], *metric_args)

        if os.path.exists(self._index_name):
            logging.debug("Loading index from file")
            self._n2.load(self._index_name)
            return

        logging.debug("Index file is not exist: {0}".format(self._index_name))
        logging.debug("Start fitting")

        for row in X:
            self._n2.add_data(row.tolist())
        self._n2.build(
            m=self._m,
            max_m0=self._m0,
            ef_construction=self._ef_construction,
            n_threads=self._n_threads,
        )
        self._n2.save(self._index_name)

    def query(self, v, n):
        """Return the index's ``search_by_vector`` result for ``v`` with ``n`` neighbours."""
        query_vector = v.tolist()
        return self._n2.search_by_vector(query_vector, n, self._ef_search)

    def __str__(self):
        return self.name
Beispiel #5
0
class N2(BaseANN):
    """Benchmark wrapper around n2's ``HnswIndex`` with batch-query support.

    The built index is cached in ``CACHE_DIR`` under a name derived from
    ``args.dataset`` and the build parameters.
    """

    def __init__(self, m, ef_construction, n_threads, ef_search, metric, batch):
        """Store build/search parameters; ``batch`` only affects ``self.name``."""
        batch_suffix = '_batch' if batch else ''
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d%s" % (
            m, ef_construction, n_threads, ef_search, batch_suffix)

        self._metric = metric
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search

        index_file = "index_n2_%s_M%d_efCon%d_n_thread%s" % (
            args.dataset, m, ef_construction, n_threads)
        self._index_name = os.path.join(CACHE_DIR, index_file)

    def fit(self, X):
        """Load the cached index if it exists, otherwise build it from ``X`` and save it."""
        # Translate the benchmark metric into the optional second constructor
        # argument; unknown metrics fall back to the constructor default.
        if self._metric == 'euclidean':
            metric_args = ('L2',)
        elif self._metric == 'dot':
            metric_args = ('dot',)
        else:
            metric_args = ()
        self._n2 = HnswIndex(X.shape[1], *metric_args)

        if os.path.exists(self._index_name):
            n2_logger.info("Loading index from file")
            self._n2.load(self._index_name, use_mmap=False)
        else:
            n2_logger.info("Create Index")
            for row in X:
                self._n2.add_data(row)
            self._n2.build(
                m=self._m,
                max_m0=self._m0,
                ef_construction=self._ef_construction,
                n_threads=self._n_threads,
            )
            self._n2.save(self._index_name)

    def query(self, v, n):
        """Return the index's ``search_by_vector`` result for ``v`` with ``n`` neighbours."""
        hits = self._n2.search_by_vector(v, n, self._ef_search)
        return hits

    def batch_query(self, X, n):
        """Run ``batch_search_by_vectors`` on ``X`` and keep the result in ``self.b_res``."""
        batch_hits = self._n2.batch_search_by_vectors(
            X, n, self._ef_search, self._n_threads)
        self.b_res = batch_hits

    def get_batch_results(self):
        """Return the result stored by the last ``batch_query`` call."""
        return self.b_res

    def __str__(self):
        return self.name
Beispiel #6
0
# Example: build an HNSW index from random vectors, save it, reload it and
# query it both by item id and by vector.
import random

from n2 import HnswIndex

f = 3  # number of components per vector, also passed to HnswIndex
t = HnswIndex(f)  # HnswIndex(f, "L2 or angular")
# ``range`` rather than ``xrange`` so the example also runs on Python 3.
for _ in range(1000):
    v = [random.gauss(0, 1) for _ in range(f)]
    t.add_data(v)

t.build(m=5, max_m0=10, n_threads=4)
t.save('test.n2')

# Load the saved index into a second instance and run the queries against it.
u = HnswIndex(f, "angular")
u.load('test.n2')

search_id = 1
k = 3
neighbor_ids = u.search_by_id(search_id, k)
print(
    "[search_by_id]: Nearest neighborhoods of id {}: {}".format(
        search_id,
        neighbor_ids))

example_vector_query = [random.gauss(0, 1) for _ in range(f)]
nns = u.search_by_vector(example_vector_query, k, include_distances=True)
print(
    "[search_by_vector]: Nearest neighborhoods of vector {}: {}".format(
        example_vector_query,
        nns))
Beispiel #7
0
def _buffalo(algo_name, database):
    """Benchmark ``most_similar`` for a buffalo model in three configurations.

    Trains the model selected by ``algo_name`` ('als' or 'bpr') on
    ``database``, then measures ``most_similar`` over at most 10000 item ids
    with 1, 2 and 4 workers for:

    * 'S' -- the model itself,
    * 'P' -- the parallel wrapper,
    * 'A' -- the parallel wrapper with an HNSW index attached.

    Returns:
        dict mapping each label above to a dict with, per worker count ``n``:
        ``S=n`` (elapsed / number of keys), ``E=n`` (elapsed) and ``M=n`` /
        ``A=n`` / ``B=n`` (the 'max' / 'avg' / 'min' memory usage entries).
    """
    repeat = 3
    train_options = {
        'als': {
            'num_workers': 4,
            'compute_loss_on_training': False,
            'd': 32,
            'num_iters': 10,
        },
        'bpr': {
            'num_workers': 4,
            'compute_loss_on_training': False,
            'd': 32,
            'num_iters': 100,
        },
    }
    train_opt = train_options[algo_name]

    # Pick the parallel wrapper class and the model factory for the algorithm.
    lib = BuffaloLib()
    if algo_name == 'als':
        PAR, make_model = ParALS, lib.als
    elif algo_name == 'bpr':
        PAR, make_model = ParBPRMF, lib.bpr
    model = make_model(database, return_instance_before_train=True, **train_opt)
    model.train()
    model.build_itemid_map()
    model.normalize('item')

    # Parallel wrapper without an index.
    par = PAR(model)

    # Build and save the HNSW index over model.P, then attach it to a second
    # wrapper.
    index = HnswIndex(model.P.shape[1])
    for factor in model.P:
        index.add_data(factor)
    index.build(n_threads=4)
    index.save('bm_n2.bin')

    ann = PAR(model)
    ann.set_hnsw_index('bm_n2.bin', 'item')

    total_queries = 10000
    keys = model._idmanager.itemids[::][:total_queries]
    print('Total queries: %s' % len(keys))

    results = {}
    nn_opts = {'topk': 10}
    for label, target in (('S', model), ('P', par), ('A', ann)):
        is_parallel = isinstance(target, PAR)
        stats = results[label] = {}

        call_opt = dict(nn_opts)
        if not is_parallel:
            call_opt['iterable'] = keys
        call_opt['model'] = target

        for num_workers in (1, 2, 4):
            # The wrapper carries num_workers itself; the plain model keeps
            # it on its ``opt`` object.
            if is_parallel:
                target.num_workers = num_workers
            else:
                target.opt.num_workers = num_workers

            elapsed, memory_usage = _get_elapsed_time(
                'most_similar', keys, BuffaloLib(), repeat, **call_opt)

            stats[f'S={num_workers}'] = elapsed / len(keys)
            stats[f'E={num_workers}'] = elapsed
            stats[f'M={num_workers}'] = memory_usage['max']
            stats[f'A={num_workers}'] = memory_usage['avg']
            stats[f'B={num_workers}'] = memory_usage['min']
            print(f'{label}M={num_workers} {elapsed} {memory_usage}')
    return results