Example #1
def test_string_loaded():
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"
                 ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'

    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.OBJECT_AS_STRING,
                             nmslib.DistType.INT)

    for id, data in enumerate(DATA_STRS):
        nmslib.addDataPoint(index, id, data)

    print("Let's print a few data entries")
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print("Let's load the previously built index")

    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the loaded index:")

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.freeIndex(index)
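For comparison, the same load-and-query flow in the nmslib 2.x object API, where the index itself carries the methods. This is a minimal sketch under the assumptions that nmslib >= 2.0 is installed, that small_world_rand.index exists, and that the raw strings were saved alongside the index (otherwise they must be re-added before loadIndex, as in the example above):

import nmslib

index = nmslib.init(method='small_world_rand', space='leven',
                    dtype=nmslib.DistType.INT,
                    data_type=nmslib.DataType.OBJECT_AS_STRING)
index.loadIndex('small_world_rand.index')
index.setQueryTimeParams({'initSearchAttempts': 3})
ids, distances = index.knnQuery('abc', k=2)  # ids index into the stored strings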
Example #2
def test_vector_load(fast=True, fast_batch=True, seq=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    f = '/tmp/foo.txt'
    if not os.path.isfile(f):
        print('creating %s' % f)
        np.savetxt(f, np.random.rand(100000, 1000), delimiter="\t")
        print('done')

    if fast:
        index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)
        with TimeIt('fast add data point'):
            data = read_data_fast(f)
            nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
        nmslib.freeIndex(index)

    if fast_batch:
        index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)
        with TimeIt('fast_batch add data point'):
            offset = 0
            for data in read_data_fast_batch(f, 10000):
                nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32) + offset, data)
                offset += data.shape[0]
        print('offset', offset)
        nmslib.freeIndex(index)

    if seq:
        index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)
        with TimeIt('seq add data point'):
            for id, data in enumerate(read_data(f)):
                nmslib.addDataPoint(index, id, data)
        nmslib.freeIndex(index)
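In the 2.x object API, addDataPointBatch accepts a NumPy array directly and assigns sequential integer ids when none are passed, so the "fast" branch above shrinks to a few lines. A minimal sketch, with np.loadtxt standing in for the read_data_fast helper assumed by this test module:

import numpy as np
import nmslib

index = nmslib.init(method='small_world_rand', space='cosinesimil')
data = np.loadtxt('/tmp/foo.txt', delimiter="\t", dtype=np.float32)
index.addDataPointBatch(data)  # ids default to 0..len(data)-1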
Example #3
def test_string_fresh(batch=True):
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"
                 ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'

    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.OBJECT_AS_STRING,
                             nmslib.DistType.INT)

    if batch:
        print('DATA_STRS', DATA_STRS)
        positions = nmslib.addDataPointBatch(index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, id, data)

    print("Let's print a few data entries")
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print("Let's invoke the index-build process")

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print(idx, data, res, [DATA_STRS[i] for i in res])

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
Example #4
 def setUp(self):
     space_type = "normleven"
     space_param = []
     method_name = "small_world_rand"
     index_name = method_name + ".index"
     if os.path.isfile(index_name):
         os.remove(index_name)
     self.index = nmslib.init(
         space_type, space_param, method_name, nmslib.DataType.OBJECT_AS_STRING, nmslib.DistType.FLOAT
     )
Example #5
 def setUp(self):
     space_type = "cosinesimil_sparse"
     space_param = []
     method_name = "small_world_rand"
     index_name = method_name + ".index"
     if os.path.isfile(index_name):
         os.remove(index_name)
     self.index = nmslib.init(
         space_type, space_param, method_name, nmslib.DataType.SPARSE_VECTOR, nmslib.DistType.FLOAT
     )
Example #6
    def fit(self, Ciu):
        # nmslib can be a little chatty when first imported, disable some of
        # the logging
        logging.getLogger('nmslib').setLevel(logging.WARNING)
        import nmslib

        # train the model
        super(NMSLibAlternatingLeastSquares, self).fit(Ciu)

        # create index for similar_items
        if self.approximate_similar_items:
            log.debug("Building nmslib similar items index")
            self.similar_items_index = nmslib.init(
                method=self.method, space='cosinesimil')

            # there are some numerical instability issues here with
            # building a cosine index with vectors with 0 norms, hack around this
            # by just not indexing them
            norms = numpy.linalg.norm(self.item_factors, axis=1)
            ids = numpy.arange(self.item_factors.shape[0])

            # delete zero valued rows from the matrix
            item_factors = numpy.delete(self.item_factors, ids[norms == 0], axis=0)
            ids = ids[norms != 0]

            self.similar_items_index.addDataPointBatch(item_factors, ids=ids)
            self.similar_items_index.createIndex(self.index_params,
                                                 print_progress=self.show_progress)
            self.similar_items_index.setQueryTimeParams(self.query_params)

        # build up a separate index for the inner product (for recommend
        # methods)
        if self.approximate_recommend:
            log.debug("Building nmslib recommendation index")
            self.max_norm, extra = augment_inner_product_matrix(
                self.item_factors)
            self.recommend_index = nmslib.init(
                method='hnsw', space='cosinesimil')
            self.recommend_index.addDataPointBatch(extra)
            self.recommend_index.createIndex(self.index_params, print_progress=self.show_progress)
            self.recommend_index.setQueryTimeParams(self.query_params)
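The augment_inner_product_matrix call above relies on the standard reduction from maximum inner product to cosine search: each row gets one extra coordinate sqrt(max_norm^2 - ||x||^2), so every augmented row has the same L2 norm and cosine ranking on the padded vectors matches inner-product ranking on the originals. A minimal sketch of that idea (an illustration of the technique, not necessarily the source library's exact implementation):

import numpy

def augment_inner_product(factors):
    # pad each row so that all augmented rows share the same L2 norm
    norms = numpy.linalg.norm(factors, axis=1)
    max_norm = norms.max()
    extra_col = numpy.sqrt(max_norm ** 2 - norms ** 2).reshape(-1, 1)
    return max_norm, numpy.append(factors, extra_col, axis=1)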
Example #7
    def fit(self, X):
        if self._method_name == 'vptree':
            # To avoid this issue:
            # terminate called after throwing an instance of 'std::runtime_error'
            # what():  The data size is too small or the bucket size is too big. Select the parameters so that <total # of records> is NOT less than <bucket size> * 1000
            # Aborted (core dumped)
            self._method_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

        self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
        self._index.addDataPointBatch(X)

        self._index.createIndex(self._method_param)
Example #8
def test_object_as_string_fresh(batch=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.OBJECT_AS_STRING,
                             nmslib.DistType.FLOAT)

    if batch:
        data = [s for s in read_data_as_string('sample_dataset.txt')]
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, id, data)

    print("Let's print a few data entries")
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print("Let's invoke the index-build process")

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    for idx, data in enumerate(read_data_as_string('sample_queryset.txt')):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
Example #9
 def setUp(self):
     space_type = 'leven'
     space_param = []
     method_name = 'small_world_rand'
     index_name = method_name + '.index'
     if os.path.isfile(index_name):
         os.remove(index_name)
     self.index = nmslib.init(
         space_type,
         space_param,
         method_name,
         nmslib.DataType.OBJECT_AS_STRING,
         nmslib.DistType.INT)
Example #10
 def setUp(self):
     space_type = 'cosinesimil'
     space_param = []
     method_name = 'small_world_rand'
     index_name = method_name + '.index'
     if os.path.isfile(index_name):
         os.remove(index_name)
     self.index = nmslib.init(
         space_type,
         space_param,
         method_name,
         nmslib.DataType.DENSE_VECTOR,
         nmslib.DistType.FLOAT)
Example #11
    def fit(self, X):
        if self._method_name == 'vptree':
            # To avoid this issue:
            # terminate called after throwing an instance of 'std::runtime_error'
            # what():  The data size is too small or the bucket size is too big. Select the parameters so that <total # of records> is NOT less than <bucket size> * 1000
            # Aborted (core dumped)
            self._method_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

        self._index = nmslib.init(self._nmslib_metric, [], self._method_name, nmslib.DataType.DENSE_VECTOR, nmslib.DistType.FLOAT)

        for i, x in enumerate(X):
            nmslib.addDataPoint(self._index, i, x.tolist())

        nmslib.createIndex(self._index, self._method_param)
Example #12
def test_sparse_vector_fresh():
    space_type = 'cosinesimil_sparse'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.SPARSE_VECTOR,
                             nmslib.DistType.FLOAT)

    for id, data in enumerate(read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(index, id, data)

    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print("Let's invoke the index-build process")

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    for idx, data in enumerate(read_sparse_data('sample_sparse_queryset.txt')):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
Example #13
    def testSparse(self):
        index = nmslib.init(method='small_world_rand', space='cosinesimil_sparse',
                            data_type=nmslib.DataType.SPARSE_VECTOR)

        index.addDataPoint(0, [(1, 2.), (2, 3.)])
        index.addDataPoint(1, [(0, 1.), (1, 2.)])
        index.addDataPoint(2, [(2, 3.), (3, 3.)])
        index.addDataPoint(3, [(3, 1.)])

        index.createIndex()

        ids, distances = index.knnQuery([(1, 2.), (2, 3.)])
        self.assertEqual(ids[0], 0)
        self.assertEqual(distances[0], 0)

        self.assertEqual(len(index), 4)
        self.assertEqual(index[3], [(3, 1.0)])
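For sparse spaces, each point is a list of (dimension, value) pairs, as above; addDataPointBatch also accepts a scipy.sparse.csr_matrix, which is usually more convenient. A sketch of the same four points in that form (an equivalent formulation, assuming SciPy is available):

from scipy.sparse import csr_matrix

rows = [0, 0, 1, 1, 2, 2, 3]
cols = [1, 2, 0, 1, 2, 3, 3]
vals = [2., 3., 1., 2., 3., 3., 1.]
index.addDataPointBatch(csr_matrix((vals, (rows, cols)), shape=(4, 4)))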
Example #14
def test_vector_loaded():
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)

    for id, data in enumerate(read_data('sample_dataset.txt')):
        pos = nmslib.addDataPoint(index, id, data)
        if id != pos:
            print('id %s != pos %s' % (id, pos))
            sys.exit(1)

    print("Let's print a few data entries")
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print("Let's load the previously built index")

    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print "The index %s is loaded" % index_name

    nmslib.setQueryTimeParams(index,query_time_param)

    print 'Query time parameters are set'

    print "Results for the loaded index"

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)
Example #15
    def fit(self, X):
        if self._method_name == 'vptree':
            # To avoid this issue:
            # terminate called after throwing an instance of 'std::runtime_error'
            # what():  The data size is too small or the bucket size is too big. Select the parameters so that <total # of records> is NOT less than <bucket size> * 1000
            # Aborted (core dumped)
            self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

        self._index = nmslib.init(space=self._nmslib_metric, method=self._method_name)
        self._index.addDataPointBatch(X)

        if os.path.exists(self._index_name):
            print('Loading index from file')
            self._index.loadIndex(self._index_name)
        else:
            self._index.createIndex(self._index_param)
            if self._save_index:
                self._index.saveIndex(self._index_name)

        self._index.setQueryTimeParams(self._query_param)
Example #16
    def testStringLeven(self):
        index = nmslib.init(space='leven',
                            dtype=nmslib.DistType.INT,
                            data_type=nmslib.DataType.OBJECT_AS_STRING,
                            method='small_world_rand')

        strings = [''.join(x) for x in itertools.permutations(['a', 't', 'c', 'g'])]

        index.addDataPointBatch(strings)

        index.addDataPoint(len(index), "atat")
        index.addDataPoint(len(index), "gaga")
        index.createIndex()

        for i, distance in zip(*index.knnQuery(strings[0])):
            self.assertEqual(index.getDistance(0, i), distance)

        self.assertEqual(len(index), len(strings) + 2)
        self.assertEqual(index[0], strings[0])
        self.assertEqual(index[len(index)-2], 'atat')
Example #17
def bench_sparse_vector(batch=True):
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance
    from pysparnn.cluster_index import MultiClusterIndex

    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0])

    k = 3

    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print(int(i), distance.cosine(q0, dataset[int(i), :]))

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type, space_param, method_name,
                        nmslib.DataType.SPARSE_VECTOR, nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(
                index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions', positions)
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print(idx, v)
                if idx == 0:
                    for i in v:
                        print('q0', i, distance.cosine(q0, dataset[i, :]))
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print(idx, res)

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
Example #18
import sys
import time
import math
import pdb
from xclib.data import data_utils
import nmslib
import hnswlib

lbl_ft_file = sys.argv[1]
model_file = sys.argv[2]
M = int(sys.argv[3])
efC = int(sys.argv[4])
num_threads = int(sys.argv[5])
num_ft = int(sys.argv[6])
metric_space = sys.argv[7]

start = time.time()
data = data_utils.read_sparse_file(lbl_ft_file)
end = time.time()
start = time.time()
index = nmslib.init(method='hnsw',
                    space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
index.addDataPointBatch(data)
index.createIndex({
    'M': M,
    'indexThreadQty': num_threads,
    'efConstruction': efC
})
end = time.time()
print('Training time of ANNS datastructure = %f' % (end - start))
index.saveIndex(model_file)
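A hedged sketch of reading that index back for querying (same space and data type as above; the efSearch value of 100 is an illustrative assumption):

index = nmslib.init(method='hnsw',
                    space='cosinesimil_sparse',
                    data_type=nmslib.DataType.SPARSE_VECTOR)
index.loadIndex(model_file)
index.setQueryTimeParams({'efSearch': 100})
neighbours = index.knnQueryBatch(data, k=10, num_threads=num_threads)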
Example #19
import dlib, os, shutil
import numpy as np
from skimage import io
from scipy.spatial import distance
import pickle
import nmslib

index = nmslib.init(method='hnsw', space='l2', data_type=nmslib.DataType.DENSE_VECTOR)

files = os.listdir('npy')

es = []
e = 0

ff = open('associations.txt', 'w')

for x in files:
    e = e + 1
    name, _ = os.path.splitext(x)
    embedding = np.load('npy/' + x)
    ff.write(str(e) + '|' + x + '\n')
    index.addDataPoint(e, embedding)

index_time_params = {
    'indexThreadQty': 4,
    'skip_optimized_index': 0,
    'post': 2,
    'delaunay_type': 1,
    'M': 100,
    'efConstruction': 2000
}
Example #20
def create_and_search_index(retcfg, jobs):

    batch_size = -1

    q_features = load_features(retcfg['path']['qfeature'])
    q_namelist = np.loadtxt(retcfg['path']['qlist'],
                            dtype=dict(names=('qname', 'nfeat'),
                                       formats=('U100', np.int32)))

    assert q_features.shape[0] == np.sum(q_namelist['nfeat']), "Inconsistent number of features sum and size of " \
                                                               "query features array"

    norm = retcfg.get('feature', 'norm', fallback=None)

    db_features = load_features(retcfg['path']['dbfeature'])
    if norm:
        db_features = normalize(db_features, norm)
        q_features = normalize(q_features, norm)

    outdir = retcfg['path']['outdir'] + "queryfiles/"
    safe_create_dir(outdir)

    index_type = retcfg['index']['index_type']
    dist_type = retcfg['index']['dist_type']

    knn = retcfg.getint('search', 'knn')
    M = retcfg.getint('index', 'M', fallback=20)
    efC = retcfg.getint('index', 'efC', fallback=20)

    print(" -- Creating <{0:s}> NN index".format(index_type))
    print("     -> KNN: {0:d}".format(knn))
    print("     -> Metric: {0:s}\n".format(dist_type))

    nnidx = nmslib.init(method=index_type, space=dist_type)
    nnidx.addDataPointBatch(db_features)
    del db_features
    nnidx.createIndex({'post': 2}, print_progress=True)
    nnidx.setQueryTimeParams({'efSearch': knn})

    if batch_size == -1:
        batch_size = q_features.shape[0]

    n_batches = int(np.ceil(q_features.shape[0] / batch_size))

    for i in tqdm(range(n_batches), ncols=100, desc='Batch', total=n_batches):
        s = i * batch_size
        e = s + batch_size
        batch_q_features = q_features[s:e]

        neighbours = nnidx.knnQueryBatch(batch_q_features,
                                         k=knn,
                                         num_threads=jobs)
        neighbours = list(zip(*neighbours))

    indices = np.array(neighbours[0])
    distances = np.array(neighbours[1])

    s = 0
    for qname, n in q_namelist:

        qdists = distances[s:s + n]
        qidx = indices[s:s + n]

        matchfpath = "{0:s}{1:s}.matches".format(outdir, qname)
        distfpath = "{0:s}{1:s}.dist".format(outdir, qname)

        print(qname, "-> ", s, ":", s + n)
        print("   |_ dists: ", qdists.shape)
        print("   |_ indices: ", qidx.shape, end="\n---\n")
        np.save(matchfpath + ".npy", qidx)
        np.save(distfpath + ".npy", qdists)
        s += n

    print("---", flush=True)
Example #21
import nmslib

import utils

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
server = app.server

# ------------- loading index -----------
os.system("setup.py")

# reading the arxiv data
df = pd.read_csv("data/arxiv_smaller.csv")

index_title = nmslib.init(method='hnsw', space='cosinesimil')
index_author = nmslib.init(method='hnsw', space='cosinesimil')
index_categories = nmslib.init(method='hnsw', space='cosinesimil')

index_title.loadIndex("index_title.bin")
index_author.loadIndex("index_author.bin")
index_categories.loadIndex("index_categories.bin")

# ------------- Define layout for the app ----------------

app.layout = html.Div([
    dcc.Tabs(id='tabs-nav',
             value='tab-1',
             children=[
                 dcc.Tab(label='Search engine', value='tab-1'),
                 dcc.Tab(label='Data', value='tab-2'),
Example #22
 def fit(self, data):
     self.index = nmslib.init(method=self.method, space='cosinesimil')
     self.index.addDataPointBatch(data)
     self.index.createIndex(self.indexparams, print_progress=True)
Example #23

algorithm = []
constructionTimes = []
searchTimes = []
recall = []
k = 100
avgdistances = []

constructionClocks = []
searchClocks = []
clockAlg = []
##vp-tree

import nmslib
vptree = nmslib.init(method='vptree', space='l2')

startTime = process_time()
vptree.addDataPointBatch(train)
vptree.createIndex({'bucketSize' : 10000,'selectPivotAttempts':10})
end_time = process_time()
constructionTime = end_time - startTime

# get all nearest neighbours for all the datapoint
# using a pool of 4 threads to compute
for maxLeave in [30]:#[2,10,15,20,25]:
  
    vptree.setQueryTimeParams({'maxLeavesToVisit':maxLeave,'alphaLeft':1.1,'alphaRight':1.1})
    startTime = process_time()
    neighbours = vptree.knnQueryBatch(query,k=100, num_threads=2 )
    end_time = process_time()
Example #24
def find_edges(input, test, K, cluster_ids, query_ids):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()

    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type) # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC, num_threads = 30, 100, 10
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        space_name = 'cosinesimil_sparse'
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        
        print(f"type(input) = {type(input)} type(test)={type(test)}", end=" ")
        
        tree.addDataPointBatch(input)

        tree.createIndex(index_time_params)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params)
        tree.setQueryTimeParams(query_time_params)

    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")


    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K+1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        indices_ = tree.knnQueryBatch(test, k=K, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError

    print(f"time={time.time()-st_time:.3f}s")


    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((query_ids[index1], center_ids[index2]))
    print(f"\tdone! .... time={time.time()-st_time:.3f}s")
    return edge_list
Example #25
                                   model_config.inputs['share']['preprocess'])
else:
    pre = engine.load_preprocessor(preprocess_dir, model_config.net_name)

model_config.inputs['share']['custom_corpus'] = os.path.join(
    base_dir, model_config.inputs['share']['custom_corpus'])
docs, embeds = build_document_embeddings(config)

logger.info("Loading search index...")
index_name = 'custom_index'
if not os.path.exists(index_name):
    logger.info("Search index not found. Building it...")
    search_engine = build_search_index(embeds)
    search_engine.saveIndex(index_name)
else:
    search_engine = nmslib.init(method='hnsw', space='cosinesimil')
    search_engine.loadIndex(index_name)

logger.info("Model ready to query.")


@hug.cli()
@hug.get(examples='query=how%20to%20connect%20to%20printer')
@hug.local()
def search(query: hug.types.text):
    sparse_input = pre.transform_list([query])[0]
    sparse_input = np.expand_dims(sparse_input, axis=0)
    dense_input = embed_model.predict(sparse_input)[0]

    idxs, dists = search_engine.knnQuery(dense_input, k=3)
    res = []
Example #26
    c2v.vectorize_dict(dict_path, encoding_type=encoding_type)
if not os.path.exists('./dict_index_{}.bin'.format(emb_size)):
    c2v.create_index(emb_size)
"""Loading necessary resources"""
dictionnary = []
with open(dict_path, 'r', encoding=encoding_type) as file:
    if dict_path.endswith(".json"):
        dictionnary = json.load(file)
    else:
        for line in file:
            dictionnary.append(line.strip())

index = nmslib.init(method="hnsw", space="cosinesimil")
index.loadIndex('./dict_index_{}.bin'.format(emb_size))
c2v_model = c2v.load_model("train_fr_150")

import time

if len(sys.argv) == 1:
    """ K-nearest-neigbors search"""
    print("\nEdit distance 1:")
    stamp = time.time()
    requests1 = []
    requests1.append(c2v.find_knn("langage", dictionnary, c2v_model, index))
    requests1.append(c2v.find_knn("langqge", dictionnary, c2v_model, index))
    requests1.append(c2v.find_knn("langagee", dictionnary, c2v_model, index))
    time1 = (time.time() - stamp)
Example #27
 def nmslib_init():
     """Initializes an nmslib index object"""
     index = nmslib.init(method='hnsw', space='cosinesimil')
     return index
Example #28
    return relevant / total


if __name__ == '__main__':
    # load data
    annoy_metrics = 'angular'
    annoy_metrics = 'euclidean'
    start_scratch = True
    if start_scratch:
        df = pd.read_csv('user_factor.csv', header=None)
        df = df.values[:, 1:]
        num_users, ranks = df.shape
        t = AnnoyIndex(ranks, metric=annoy_metrics)
        t.load('tree_50')
        space_name = 'l2'
        index = nmslib.init(method='hnsw', space=space_name)
        index.addDataPointBatch(df)
        index.loadIndex('hnsw_index80.bin')

        # Set index parameters
        # These are the most important ones
        NN = 50
        efC = 100

        num_threads = 4
        index_time_params = {
            'NN': NN,
            'indexThreadQty': num_threads,
            'efConstruction': efC
        }
Example #29
def shortcut_search_query(user_input_query, user_input_program, user_input_device):
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.loadIndex('src/models/sparse_index_word2vec_shortcut_search.bin', load_data=True)

    ids = shortcut_query(user_input_query, user_input_program, user_input_device, index)
    return ids
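load_data=True here matches an index written with saveIndex(..., save_data=True), as in Example #34; without the saved raw data, spaces that need the original objects cannot answer queries. A minimal sketch of the corresponding save side, under the assumption that the index was built from a 2-D float array called vectors:

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(vectors)
index.createIndex({'post': 2}, print_progress=True)
index.saveIndex('src/models/sparse_index_word2vec_shortcut_search.bin', save_data=True)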
Example #30
                 (bm25.doc_len[i] / bm25.avgdl)) + bm25.doc_freqs[i][word])
        weighted_vector = vector * weight
        doc_vector.append(weighted_vector)
    doc_vector_mean = np.mean(doc_vector, axis=0)
    weighted_doc_vects.append(doc_vector_mean)

# Save vectors
pickle.dump(weighted_doc_vects,
            open("models/weighted_doc_vects_" + searchname + "_.p",
                 "wb"))  #save the results to disc

# create a matrix from our document vectors
data = np.vstack(weighted_doc_vects)

# initialize a new index, using a HNSW index on Cosine Similarity
index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(data)
index.createIndex({'post': 2}, print_progress=True)


# Search function
def besceaSearch(query_text):
    output_list = []
    input = query_text.lower().split()
    query = [ft_model[vec] for vec in input]
    query = np.mean(query, axis=0)
    t0 = time.time()
    ids, distances = index.knnQuery(query, k=return_results_count)
    t1 = time.time()
    print(f'Searched {df.shape[0]} records in {round(t1-t0,4) } seconds \n')
    for i, j in zip(ids, distances):
Example #31
def predict_topk(biosyn,
                 eval_dictionary,
                 eval_queries,
                 topk,
                 score_mode='hybrid',
                 type_given=False):
    """
    Parameters
    ----------
    score_mode : str
        hybrid, dense, sparse
    """
    encoder = biosyn.get_dense_encoder()
    tokenizer = biosyn.get_dense_tokenizer()
    sparse_encoder = biosyn.get_sparse_encoder()
    sparse_weight = biosyn.get_sparse_weight().item()  # must be scalar value

    # useful if we're conditioning on types
    all_indv_types = [x for t in eval_dictionary[:, 1] for x in t.split('|')]
    unique_types = np.unique(all_indv_types).tolist()
    v_check_type = np.vectorize(check_label)
    inv_idx = {
        t: v_check_type(eval_dictionary[:, 1], t).nonzero()[0]
        for t in unique_types
    }

    # embed dictionary
    dict_sparse_embeds = biosyn.embed_sparse(names=eval_dictionary[:, 0],
                                             show_progress=True)
    dict_dense_embeds = biosyn.embed_dense(names=eval_dictionary[:, 0],
                                           show_progress=True)

    # build the sparse index
    if not type_given:
        sparse_index = nmslib.init(method='hnsw',
                                   space='negdotprod_sparse_fast',
                                   data_type=nmslib.DataType.SPARSE_VECTOR)
        sparse_index.addDataPointBatch(dict_sparse_embeds)
        sparse_index.createIndex({'post': 2}, print_progress=False)
    else:
        sparse_index = {}
        for sty, indices in inv_idx.items():
            sparse_index[sty] = nmslib.init(
                method='hnsw',
                space='negdotprod_sparse_fast',
                data_type=nmslib.DataType.SPARSE_VECTOR)
            sparse_index[sty].addDataPointBatch(dict_sparse_embeds[indices])
            sparse_index[sty].createIndex({'post': 2}, print_progress=False)

    # build the dense index
    d = dict_dense_embeds.shape[1]
    if not type_given:
        nembeds = dict_dense_embeds.shape[0]
        if nembeds < 10000:  # if the number of embeddings is small, don't approximate
            dense_index = faiss.IndexFlatIP(d)
            dense_index.add(dict_dense_embeds)
        else:
            nlist = int(math.floor(
                math.sqrt(nembeds)))  # number of quantized cells
            nprobe = int(math.floor(
                math.sqrt(nlist)))  # number of the quantized cells to probe
            quantizer = faiss.IndexFlatIP(d)
            dense_index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                             faiss.METRIC_INNER_PRODUCT)
            dense_index.train(dict_dense_embeds)
            dense_index.add(dict_dense_embeds)
            dense_index.nprobe = nprobe
    else:
        dense_index = {}
        for sty, indices in inv_idx.items():
            sty_dict_dense_embeds = dict_dense_embeds[indices]
            nembeds = sty_dict_dense_embeds.shape[0]
            if nembeds < 10000:  # if the number of embeddings is small, don't approximate
                dense_index[sty] = faiss.IndexFlatIP(d)
                dense_index[sty].add(sty_dict_dense_embeds)
            else:
                nlist = int(math.floor(
                    math.sqrt(nembeds)))  # number of quantized cells
                nprobe = int(math.floor(math.sqrt(
                    nlist)))  # number of the quantized cells to probe
                quantizer = faiss.IndexFlatIP(d)
                dense_index[sty] = faiss.IndexIVFFlat(
                    quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
                dense_index[sty].train(sty_dict_dense_embeds)
                dense_index[sty].add(sty_dict_dense_embeds)
                dense_index[sty].nprobe = nprobe

    # respond to mention queries
    queries = []
    for eval_query in tqdm(eval_queries, total=len(eval_queries)):
        mentions = eval_query[0].replace("+", "|").split("|")
        golden_cui = eval_query[1].replace("+", "|")
        golden_sty = eval_query[2].replace("+", "|")
        pmid = eval_query[3]
        start_char = eval_query[4]
        end_char = eval_query[5]

        dict_mentions = []
        for mention in mentions:

            mention_sparse_embeds = biosyn.embed_sparse(
                names=np.array([mention]))
            mention_dense_embeds = biosyn.embed_dense(
                names=np.array([mention]))

            # search the sparse index
            if not type_given:
                sparse_nn = sparse_index.knnQueryBatch(mention_sparse_embeds,
                                                       k=topk,
                                                       num_threads=20)
            else:
                sparse_nn = sparse_index[golden_sty].knnQueryBatch(
                    mention_sparse_embeds, k=topk, num_threads=20)
            sparse_idxs, _ = zip(*sparse_nn)
            s_candidate_idxs = np.asarray(sparse_idxs)
            if type_given:
                # reverse mask index mapping
                s_candidate_idxs = inv_idx[golden_sty][s_candidate_idxs]
            s_candidate_idxs = s_candidate_idxs.astype(np.int64)

            # search the dense index
            if not type_given:
                _, d_candidate_idxs = dense_index.search(
                    mention_dense_embeds, topk)
            else:
                _, d_candidate_idxs = dense_index[golden_sty].search(
                    mention_dense_embeds, topk)
                # reverse mask index mapping
                d_candidate_idxs = inv_idx[golden_sty][d_candidate_idxs]
            d_candidate_idxs = d_candidate_idxs.astype(np.int64)

            # get the reduced candidate set
            reduced_candidate_idxs = np.unique(
                np.hstack([
                    s_candidate_idxs.reshape(-1, ),
                    d_candidate_idxs.reshape(-1, )
                ]))

            # get score matrix
            sparse_score_matrix = biosyn.get_score_matrix(
                query_embeds=mention_sparse_embeds,
                dict_embeds=dict_sparse_embeds[
                    reduced_candidate_idxs, :]).todense()
            dense_score_matrix = biosyn.get_score_matrix(
                query_embeds=mention_dense_embeds,
                dict_embeds=dict_dense_embeds[reduced_candidate_idxs, :])

            if score_mode == 'hybrid':
                score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix
            elif score_mode == 'dense':
                score_matrix = dense_score_matrix
            elif score_mode == 'sparse':
                score_matrix = sparse_score_matrix
            else:
                raise NotImplementedError()

            # take care of getting the best indices
            candidate_idxs = biosyn.retrieve_candidate(
                score_matrix=score_matrix, topk=topk)
            candidate_idxs = reduced_candidate_idxs[candidate_idxs]

            np_candidates = eval_dictionary[candidate_idxs].squeeze()
            dict_candidates = []
            for np_candidate in np_candidates:
                dict_candidates.append({
                    'name': np_candidate[0],
                    'sty': np_candidate[1],
                    'cui': np_candidate[2],
                    'label': check_label(np_candidate[2], golden_cui)
                })
            dict_mentions.append({
                'mention': mention,
                'golden_cui': golden_cui,  # golden_cui can be composite cui
                'pmid': pmid,
                'start_char': start_char,
                'end_char': end_char,
                'candidates': dict_candidates
            })
        queries.append({'mentions': dict_mentions})

    result = {'queries': queries}

    return result
Example #32
def load_index(model_name: str):
    import nmslib

    index = nmslib.init(method="hnsw", space="cosinesimil")
    index.loadIndex(filename=f"{base_path()}{model_name}/{index_file_name}")
    return index
Example #33
def test_vector_fresh(fast=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print('data.shape', data.shape)
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, id, data)
            if id != pos:
                print('id %s != pos %s' % (id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    print("Distance between points (0,0) " + str(nmslib.getDistance(index, 0, 0)));
    print("Distance between points (1,1) " + str(nmslib.getDistance(index, 1, 1)));
    print("Distance between points (0,1) " + str(nmslib.getDistance(index, 0, 1)));
    print("Distance between points (1,0) " + str(nmslib.getDistance(index, 1, 0)));

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index,query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print(idx, v)
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print(idx, nmslib.knnQuery(index, k, data))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
Example #34
    else:
        for i, d in enumerate(tqdm(corpus)):
            vectors.append(d['vector'])

    M = args.M
    efC = args.ef
    num_threads = args.threads
    index_time_params = {
        'M': M,
        'indexThreadQty': num_threads,
        'efConstruction': efC,
        'post': 0
    }
    if args.is_sparse:
        index = nmslib.init(method='hnsw',
                            space='negdotprod_sparse',
                            data_type=nmslib.DataType.SPARSE_VECTOR)
    else:
        index = nmslib.init(method='hnsw',
                            space='negdotprod',
                            data_type=nmslib.DataType.DENSE_VECTOR)
    index.addDataPointBatch(vectors)
    start = time.time()
    index.createIndex(index_time_params, print_progress=True)
    end = time.time()
    index_time = end - start
    print('Index-time parameters', index_time_params)
    print('Indexing time = %f' % index_time)
    index.saveIndex(os.path.join(args.hnsw_index, 'index.bin'), save_data=True)

    metadata = copy.deepcopy(index_time_params)
Example #35
def test_string_fresh(batch=True):
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"
                 ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'

    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.OBJECT_AS_STRING,
                             nmslib.DistType.INT)

    if batch:
        print('DATA_STRS', DATA_STRS)
        positions = nmslib.addDataPointBatch(index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    print("Distance between points (0,0) " + str(nmslib.getDistance(index, 0, 0)));
    print("Distance between points (1,1) " + str(nmslib.getDistance(index, 1, 1)));
    print("Distance between points (0,1) " + str(nmslib.getDistance(index, 0, 1)));
    print("Distance between points (1,0) " + str(nmslib.getDistance(index, 1, 0)));

    for i in range(0,min(MAX_PRINT_QTY,nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index,i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print(idx, data, res, [DATA_STRS[i] for i in res])

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
Example #36
 def fit(self, data):
     self.index = nmslib.init(method=self.method, space='cosinesimil')
     self.index.addDataPointBatch(data)
     self.index.createIndex(self.indexparams, print_progress=True)
Example #37
def test_vector_fresh(fast=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib.DataType.DENSE_VECTOR,
                             nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print('data.shape', data.shape)
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, id, data)
            if id != pos:
                print('id %s != pos %s' % (id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print("Let's print a few data entries")
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print("Let's invoke the index-build process")

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3

    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print(idx, v)
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print(idx, nmslib.knnQuery(index, k, data))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
Example #38
searchTimes = []
recall = []
k = 100
avgdistances = []
MMAXparam = []
dgraphParam = []
constructionClocks = []
searchClocks = []
clockAlg = []

import nmslib

for example in [(dgraph, MMAX) for dgraph in [0, 1, 2, 3]
                for MMAX in [2, 4, 6, 8, 10, 12]]:

    hnsw = nmslib.init(method='hnsw', space='l2')

    dgraph = example[0]
    MMAX = example[1]

    MMAXparam.append(example[1])
    dgraphParam.append(example[0])

    startClock = time.perf_counter()
    startTime = process_time()
    hnsw.addDataPointBatch(train)
    hnsw.createIndex({'delaunay_type': dgraph, 'M': MMAX})
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock
Example #39
 def _get_index(self, space='cosinesimil'):
     return nmslib.init(method='vptree', space=space)
Example #40
 def testGlobal(self):
     # this is a one line reproduction of https://github.com/nmslib/nmslib/issues/327
     GlobalTestCase.index = nmslib.init()
Example #41
 def _create_vector_space(self, file_path):
     vector_data = self._read_tsv_file(file_path)
     vector_space = nmslib.init(method='hnsw', space='cosinesimil')
     vector_space.addDataPointBatch(vector_data)
     vector_space.createIndex({'post': 2}, print_progress=True)
     return vector_space
Example #42
 def build_advanced_index(self, vecs: 'np.ndarray'):
     import nmslib
     _index = nmslib.init(method=self.method, space=self.space)
     _index.addDataPointBatch(vecs.astype(np.float32))
     _index.createIndex({'post': 2}, print_progress=self.print_progress)
     return _index
Example #43
 def _get_index(self, space='cosinesimil'):
     return nmslib.init(method='sw-graph', space=space)
Example #44
 def fit(self, x):
     self._index = nmslib.init(space=self._metric, method=self._method)
     self._index.addDataPointBatch(x)
     self._index.createIndex(index_params={'efConstruction': 500},
                             print_progress=True)
     self._index.setQueryTimeParams(params={'efSearch': 500})
Example #45
 def load(self):
     self.index = nmslib.init(method='hnsw', space='l2')
     self.index.loadIndex(self.cfg.faceidx_pkl)
     self.lookup_frame = pd.read_csv(self.cfg.framelookup_csv, index_col=0)
Example #46
 def _rebuild_index(self):
     self.index = nmslib.init(method="hnsw", space="cosinesimil")
     self.index.addDataPointBatch(data=self.embs[:self.current_capacity])
     self.index.createIndex(print_progress=self.print_progress)
Example #47
 def _get_index(self, space='cosinesimil'):
     return nmslib.init(method='vptree', space=space)
Example #48
def build_ann_index(feature_vectors):
    print('\nBuilding nmslib index')
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.addDataPointBatch(feature_vectors)
    index.createIndex({'post': 2}, print_progress=True)
    return index
Example #49
index_time_params = {
    'M': M,
    'indexThreadQty': num_threads,
    'efConstruction': efC,
    'post': 0
}
print('Index-time parameters', index_time_params)

# Number of neighbors
K = 100
space_name = 'l2'

# Initialize the library, specify the space, the type of the vector and add data points
index = nmslib.init(method='hnsw',
                    space=space_name,
                    data_type=nmslib.DataType.DENSE_VECTOR)

index.addDataPointBatch(features_data)

# Create an index
start = time.time()
index_time_params = {
    'M': M,
    'indexThreadQty': num_threads,
    'efConstruction': efC
}
index.createIndex(index_time_params)
end = time.time()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end - start))
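The snippet stops after index construction; the usual continuation sets query-time parameters and runs a batch query. A minimal sketch, under the assumptions that features_data doubles as the query set and that efSearch=100 is an acceptable accuracy/speed trade-off:

index.setQueryTimeParams({'efSearch': 100})
neighbours = index.knnQueryBatch(features_data, k=K, num_threads=num_threads)
ids, distances = neighbours[0]  # one (ids, distances) pair per query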
Example #50
 def load(self, fn):
     self._index = nmslib.init(space=self._metric, method=self._method)
     self._index.loadIndex(fn)
     self._index.setQueryTimeParams(params={'efSearch': 500})
Example #51
def bench_sparse_vector(batch=True):
    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0])

    k = 3

    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])
    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print(int(i), distance.cosine(q0, dataset[int(i), :]))

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)
    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions', positions)
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print("Let's invoke the index-build process")

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print(idx, v)
                if idx == 0:
                    for i in v:
                        print('q0', i, distance.cosine(q0, dataset[i, :]))
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print(idx, res)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
Example #52
 def _get_index(self, space='cosinesimil'):
     return nmslib.init(method='sw-graph', space=space)