def test_string_loaded():
    """Load a previously saved string index and run k-NN queries against it.

    The data strings are added to a fresh 'leven' / 'small_world_rand' index,
    the index structure is then loaded from ``small_world_rand.index``
    (presumably written earlier by a test that calls ``nmslib.saveIndex`` --
    confirm against the test driver), and the two nearest neighbours of every
    query string are printed.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)
    # Release the index even when a later step raises.
    try:
        for point_id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, point_id, data)

        # Single-argument print() calls produce the same output on
        # Python 2 and Python 3.
        print('Let\'s print a few data entries')
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        query_time_param = ['initSearchAttempts=3']

        nmslib.loadIndex(index, index_name)
        print("The index %s is loaded" % index_name)

        nmslib.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the loaded index:")

        k = 2
        for idx, data in enumerate(QUERY_STRS):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
    finally:
        nmslib.freeIndex(index)
def test_string():
    """Build a 'leven' index over a small string set and print 2-NN results.

    Uses the ``initIndex`` / ``setData`` / ``buildIndex`` calls of the nmslib
    bindings: the index is created with a fixed capacity of
    ``len(DATA_STRS)``, filled position by position, built, and then queried
    with every string in ``QUERY_STRS``.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb",
        "d", "c", "bdaf", "ddcd",
        "egbfa", "a", "fba", "bcccfe",
        "ab", "bfgbfdc", "bcbbgf", "bfbb",
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    method_param = [
        'NN=17',
        'initIndexAttempts=3',
        'initSearchAttempts=1',
        'indexThreadQty=4',
    ]

    index = nmslib.initIndex(len(DATA_STRS),
                             space_type,
                             space_param,
                             method_name,
                             method_param,
                             nmslib.DataType.STRING,
                             nmslib.DistType.INT)
    # Release the index even when building or querying raises.
    try:
        for pos, data in enumerate(DATA_STRS):
            nmslib.setData(index, pos, data)
        nmslib.buildIndex(index)

        k = 2
        for idx, data in enumerate(QUERY_STRS):
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
    finally:
        nmslib.freeIndex(index)
def test_vector():
    """Build a 'cosinesimil' index from ``sample_dataset.txt`` and query it.

    At most ``n`` (4500) records yielded by ``read_data`` are stored in an
    index created with ``initIndex``; after ``buildIndex`` the two nearest
    neighbours of every record of ``sample_queryset.txt`` are printed.
    """
    n = 4500
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    method_param = [
        'NN=17',
        'initIndexAttempts=3',
        'initSearchAttempts=1',
        'indexThreadQty=4',
    ]

    index = nmslib.initIndex(n,
                             space_type,
                             space_param,
                             method_name,
                             method_param,
                             nmslib.DataType.VECTOR,
                             nmslib.DistType.FLOAT)
    # Release the index even when loading, building or querying raises.
    try:
        for pos, data in enumerate(read_data('sample_dataset.txt')):
            # The index was sized for n entries; ignore anything beyond that.
            if pos >= n:
                break
            nmslib.setData(index, pos, data)

        print('here')
        nmslib.buildIndex(index)

        k = 2
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
    finally:
        nmslib.freeIndex(index)
def test_string_fresh(batch=True):
    """Create, query and save a fresh 'leven' index over a small string set.

    Args:
        batch: When true, the data is added with ``addDataPointBatch`` and
            ``knnQueryBatch`` is exercised once before the per-query loop;
            otherwise points are added one at a time with ``addDataPoint``.

    The index is saved to ``small_world_rand.index`` before being freed.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                 "d", "c", "bdaf", "ddcd",
                 "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.INT)
    # Release the index even when a later step raises.
    try:
        if batch:
            # Pre-formatted single arguments keep the output identical on
            # Python 2 and Python 3.
            print('DATA_STRS %s' % (DATA_STRS,))
            nmslib.addDataPointBatch(
                index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
        else:
            for point_id, data in enumerate(DATA_STRS):
                nmslib.addDataPoint(index, point_id, data)

        print('Let\'s print a few data entries')
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib.createIndex(index, index_param)
        nmslib.setQueryTimeParams(index, query_time_param)

        print('Query time parameters are set')

        print("Results for the freshly created index:")

        k = 2
        if batch:
            num_threads = 10
            # Only checks that the batch query runs; its result is not
            # printed -- the loop below reports per-query results.
            nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)

        for idx, data in enumerate(QUERY_STRS):
            res = nmslib.knnQuery(index, k, data)
            print('%s %s %s %s' % (idx, data, res,
                                   [DATA_STRS[i] for i in res]))

        nmslib.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        nmslib.freeIndex(index)
def test_object_as_string_fresh(batch=True):
    """Create, query and save a 'cosinesimil' index fed with string objects.

    Records come from ``read_data_as_string`` and are passed to nmslib with
    ``DataType.OBJECT_AS_STRING``.

    Args:
        batch: When true, all records are added with one
            ``addDataPointBatch`` call; otherwise they are added one at a
            time with ``addDataPoint``.

    Any existing ``small_world_rand.index`` file is removed first, and the
    new index is saved under that name before being freed.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.OBJECT_AS_STRING,
                        nmslib.DistType.FLOAT)
    # Release the index even when a later step raises.
    try:
        if batch:
            data = list(read_data_as_string('sample_dataset.txt'))
            nmslib.addDataPointBatch(
                index, np.arange(len(data), dtype=np.int32), data)
        else:
            records = read_data_as_string('sample_dataset.txt')
            for point_id, data in enumerate(records):
                nmslib.addDataPoint(index, point_id, data)

        # Single-argument print() calls produce the same output on
        # Python 2 and Python 3.
        print('Let\'s print a few data entries')
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib.createIndex(index, index_param)
        print('The index is created')

        nmslib.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the freshly created index:")

        k = 3
        queries = read_data_as_string('sample_queryset.txt')
        for idx, data in enumerate(queries):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

        nmslib.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        nmslib.freeIndex(index)
def test_sparse_vector_fresh():
    """Create, query and save a fresh 'cosinesimil_sparse' index.

    Records yielded by ``read_sparse_data('sample_sparse_dataset.txt')`` are
    added one by one, the index is built, the three nearest neighbours of
    every record of ``sample_sparse_queryset.txt`` are printed, and the index
    is saved to ``small_world_rand_sparse.index`` (any existing file of that
    name is removed first).
    """
    space_type = 'cosinesimil_sparse'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)
    # Release the index even when a later step raises.
    try:
        records = read_sparse_data('sample_sparse_dataset.txt')
        for point_id, data in enumerate(records):
            nmslib.addDataPoint(index, point_id, data)

        # Single-argument print() calls produce the same output on
        # Python 2 and Python 3.
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib.createIndex(index, index_param)
        print('The index is created')

        nmslib.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the freshly created index:")

        k = 3
        queries = read_sparse_data('sample_sparse_queryset.txt')
        for idx, data in enumerate(queries):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))

        nmslib.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        nmslib.freeIndex(index)
def test_vector_loaded():
    """Load a previously saved dense-vector index and run k-NN queries.

    The records of ``sample_dataset.txt`` are added to a fresh 'cosinesimil'
    index, the index structure is loaded from ``small_world_rand.index``
    (presumably written earlier by a test that calls ``nmslib.saveIndex`` --
    confirm against the test driver), and the two nearest neighbours of every
    record of ``sample_queryset.txt`` are printed.

    Exits the process with status 1 when ``addDataPoint`` returns a position
    different from the id that was passed in.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)
    # Release the index on every exit path, including sys.exit() below.
    try:
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, data)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)

        # Single-argument print() calls produce the same output on
        # Python 2 and Python 3.
        print('Let\'s print a few data entries')
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        query_time_param = ['initSearchAttempts=3']

        nmslib.loadIndex(index, index_name)
        print("The index %s is loaded" % index_name)

        nmslib.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the loaded index")

        k = 2
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
    finally:
        nmslib.freeIndex(index)
def test_vector_loaded():
    """Load a previously saved dense-vector index and run k-NN queries.

    The records of ``sample_dataset.txt`` are added to a fresh 'cosinesimil'
    index, the index structure is loaded from ``small_world_rand.index``
    (presumably written earlier by a test that calls ``nmslib.saveIndex`` --
    confirm against the test driver), the query-time parameter
    ``efSearch=50`` is applied, and the two nearest neighbours of every
    record of ``sample_queryset.txt`` are printed.

    Exits the process with status 1 when ``addDataPoint`` returns a position
    different from the id that was passed in.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)
    # Release the index on every exit path, including sys.exit() below.
    try:
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, point_id, data)
            if point_id != pos:
                print('id %s != pos %s' % (point_id, pos))
                sys.exit(1)

        print('Let\'s print a few data entries')
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        query_time_param = ['efSearch=50']

        nmslib.loadIndex(index, index_name)
        print("The index %s is loaded" % index_name)

        nmslib.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the loaded index")

        k = 2
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print(idx, nmslib.knnQuery(index, k, data))
    finally:
        nmslib.freeIndex(index)
def test_vector():
    """Build a "cosinesimil" index from ``sample_dataset.txt`` and query it.

    At most ``n`` (4500) records yielded by ``read_data`` are stored in an
    index created with ``initIndex``; after ``buildIndex`` the two nearest
    neighbours of every record of ``sample_queryset.txt`` are printed.
    """
    n = 4500
    space_type = "cosinesimil"
    space_param = []
    method_name = "small_world_rand"
    method_param = [
        "NN=17",
        "initIndexAttempts=3",
        "initSearchAttempts=1",
        "indexThreadQty=4",
    ]

    index = nmslib.initIndex(
        n,
        space_type,
        space_param,
        method_name,
        method_param,
        nmslib.DataType.VECTOR,
        nmslib.DistType.FLOAT,
    )
    # Release the index even when loading, building or querying raises.
    try:
        for pos, data in enumerate(read_data("sample_dataset.txt")):
            # The index was sized for n entries; ignore anything beyond that.
            if pos >= n:
                break
            nmslib.setData(index, pos, data)

        print("here")
        nmslib.buildIndex(index)

        k = 2
        for idx, data in enumerate(read_data("sample_queryset.txt")):
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print("%s %s" % (idx, nmslib.knnQuery(index, k, data)))
    finally:
        nmslib.freeIndex(index)
def test_string():
    """Build a "leven" index over a small string set and print 2-NN results.

    Uses the ``initIndex`` / ``setData`` / ``buildIndex`` calls of the nmslib
    bindings: the index is created with a fixed capacity of
    ``len(DATA_STRS)``, filled position by position, built, and then queried
    with every string in ``QUERY_STRS``.
    """
    DATA_STRS = [
        "xyz",
        "beagcfa",
        "cea",
        "cb",
        "d",
        "c",
        "bdaf",
        "ddcd",
        "egbfa",
        "a",
        "fba",
        "bcccfe",
        "ab",
        "bfgbfdc",
        "bcbbgf",
        "bfbb",
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = "leven"
    space_param = []
    method_name = "small_world_rand"
    method_param = [
        "NN=17",
        "initIndexAttempts=3",
        "initSearchAttempts=1",
        "indexThreadQty=4",
    ]

    index = nmslib.initIndex(
        len(DATA_STRS),
        space_type,
        space_param,
        method_name,
        method_param,
        nmslib.DataType.STRING,
        nmslib.DistType.INT,
    )
    # Release the index even when building or querying raises.
    try:
        for pos, data in enumerate(DATA_STRS):
            nmslib.setData(index, pos, data)
        nmslib.buildIndex(index)

        k = 2
        for idx, data in enumerate(QUERY_STRS):
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print("%s %s" % (idx, nmslib.knnQuery(index, k, data)))
    finally:
        nmslib.freeIndex(index)
def _nonzero_pairs(row):
    """Return ``[[column, value], ...]`` for the entries of ``row`` that are > 0."""
    return [[i, v] for i, v in enumerate(row) if v > 0]


def bench_sparse_vector(batch=True):
    """Time sparse cosine k-NN search on random binary data.

    A random 40000 x 20000 binary dataset and a 1000 x 20000 query set are
    generated. For the first query the function prints:

    * the brute-force nearest neighbours by ``distance.cosine``,
    * the neighbours returned by ``snn.MultiClusterIndex``,
    * the neighbours returned by an nmslib 'cosinesimil_sparse_fast' index.

    The individual stages are timed with ``TimeIt``.

    Args:
        batch: When true, data is added and queried through the batch calls
            (``addDataPointBatch`` / ``knnQueryBatch``) using CSR matrices;
            otherwise each point is converted to ``[index, value]`` pairs and
            added / queried one at a time.

    The nmslib index is saved to ``small_world_rand_sparse.index`` (any
    existing file of that name is removed first) and then freed.
    """
    dim = 20000
    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    # Pre-formatted single arguments keep the output identical on
    # Python 2 and Python 3.
    print('dataset[0]: %s' % (_nonzero_pairs(dataset[0]),))

    k = 3

    # Brute-force reference: distance from the first query to every row.
    q0 = queryset[0]
    res = [[i, distance.cosine(q0, dataset[i, :])]
           for i in range(dataset.shape[0])]
    res.sort(key=lambda x: x[1])
    print('q0 res %s' % (res[:k],))

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    # A concrete list, so the record ids are not a lazy range on Python 3.
    data_to_return = list(range(dataset.shape[0]))

    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print('%s %s' % (int(i), distance.cosine(q0, dataset[int(i), :])))

    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.SPARSE_VECTOR,
                        nmslib.DistType.FLOAT)
    # Release the index even when a later step raises.
    try:
        if batch:
            with TimeIt('batch add'):
                positions = nmslib.addDataPointBatch(
                    index,
                    np.arange(len(dataset), dtype=np.int32),
                    data_matrix)
            print('positions %s' % (positions,))
        else:
            with TimeIt('preparing'):
                d = [_nonzero_pairs(row) for row in dataset]
                q = [_nonzero_pairs(row) for row in queryset]
            with TimeIt('adding points'):
                for point_id, data in enumerate(d):
                    nmslib.addDataPoint(index, point_id, data)

        print('Let\'s invoke the index-build process')

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        with TimeIt('building index'):
            nmslib.createIndex(index, index_param)

        print('The index is created')

        nmslib.setQueryTimeParams(index, query_time_param)

        print('Query time parameters are set')

        print("Results for the freshly created index:")

        with TimeIt('knn query'):
            if batch:
                num_threads = 10
                res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
                for idx, v in enumerate(res):
                    if idx < 5:
                        print('%s %s' % (idx, v))
                    if idx == 0:
                        # Show distances for the first query so they can be
                        # compared with the brute-force output above.
                        for i in v:
                            print('q0 %s %s'
                                  % (i, distance.cosine(q0, dataset[i, :])))
            else:
                for idx, data in enumerate(q):
                    res = nmslib.knnQuery(index, k, data)
                    if idx < 5:
                        print('%s %s' % (idx, res))

        nmslib.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        nmslib.freeIndex(index)
def test_vector_fresh(fast=True):
    """Create, query and save a fresh dense-vector 'cosinesimil' index.

    Args:
        fast: When true, the dataset and the queries are read with
            ``read_data_fast`` and passed to the batch calls
            (``addDataPointBatch`` / ``knnQueryBatch``); otherwise records
            from ``read_data`` are added and queried one at a time.

    Prints the time spent adding data and querying. Any existing
    ``small_world_rand.index`` file is removed first, and the new index is
    saved under that name before being freed.

    In the non-batch path the process exits with status 1 when
    ``addDataPoint`` returns a position different from the id passed in.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(space_type,
                        space_param,
                        method_name,
                        nmslib.DataType.DENSE_VECTOR,
                        nmslib.DistType.FLOAT)
    # Release the index on every exit path, including sys.exit() below.
    try:
        start = time.time()
        if fast:
            data = read_data_fast('sample_dataset.txt')
            # Pre-formatted single arguments keep the output identical on
            # Python 2 and Python 3.
            print('data.shape %s' % (data.shape,))
            nmslib.addDataPointBatch(
                index, np.arange(len(data), dtype=np.int32), data)
        else:
            for point_id, data in enumerate(read_data('sample_dataset.txt')):
                pos = nmslib.addDataPoint(index, point_id, data)
                if point_id != pos:
                    print('id %s != pos %s' % (point_id, pos))
                    sys.exit(1)
        end = time.time()
        print('added data in %s secs' % (end - start))

        print('Let\'s print a few data entries')
        print('We have added %d data points' % nmslib.getDataPointQty(index))

        for i in range(min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
            print(nmslib.getDataPoint(index, i))

        print('Let\'s invoke the index-build process')

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib.createIndex(index, index_param)
        print('The index is created')

        nmslib.setQueryTimeParams(index, query_time_param)
        print('Query time parameters are set')

        print("Results for the freshly created index:")

        k = 3

        start = time.time()
        if fast:
            num_threads = 10
            query = read_data_fast('sample_queryset.txt')
            res = nmslib.knnQueryBatch(index, num_threads, k, query)
            for idx, v in enumerate(res):
                print('%s %s' % (idx, v))
        else:
            for idx, data in enumerate(read_data('sample_queryset.txt')):
                print('%s %s' % (idx, nmslib.knnQuery(index, k, data)))
        end = time.time()
        print('querying done in %s secs' % (end - start))

        nmslib.saveIndex(index, index_name)
        print("The index %s is saved" % index_name)
    finally:
        nmslib.freeIndex(index)
def query2(q, k=10, m=3):
    """Run a k-NN query for ``q`` against the module-level ``index2``.

    Args:
        q: Query object exposing ``tolist()``; the resulting list is what is
            handed to ``nmslib.knnQuery``.
        k: Number of neighbours requested (default 10).
        m: Accepted for call compatibility; not used by this function.

    Returns:
        Whatever ``nmslib.knnQuery`` returns for the query.
    """
    knn_query = nmslib.knnQuery
    return knn_query(index2, k, q.tolist())
def query(self, v, n):
    """Return the ``nmslib.knnQuery`` result for ``v`` on ``self._index``.

    Args:
        v: Query object exposing ``tolist()``; the resulting list is what is
            handed to nmslib.
        n: Number of neighbours requested.
    """
    # nmslib is imported at call time rather than at module import time.
    import nmslib

    query_as_list = v.tolist()
    neighbours = nmslib.knnQuery(self._index, n, query_as_list)
    return neighbours