def test_string_loaded():
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.INT)

    for id, data in enumerate(DATA_STRS):
        nmslib.addDataPoint(index, id, data)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print "The index %s is loaded" % index_name

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'

    print "Results for the loaded index:"

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)
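The test functions in this file assume some module-level setup that the snippets themselves do not show: the nmslib bindings, numpy, the standard os/sys/time modules, and a MAX_PRINT_QTY limit on how many data points to print. A minimal sketch of that preamble, with the value of MAX_PRINT_QTY chosen arbitrarily here rather than taken from the original, might be:

# Assumed module-level setup for the snippets in this file.
# MAX_PRINT_QTY is an arbitrary choice, not from the original script.
import os
import sys
import time

import numpy as np
import nmslib

MAX_PRINT_QTY = 5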
def test_string():
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    method_param = ['NN=17', 'initIndexAttempts=3',
                    'initSearchAttempts=1', 'indexThreadQty=4']

    index = nmslib.initIndex(len(DATA_STRS),
                             space_type,
                             space_param,
                             method_name,
                             method_param,
                             nmslib.DataType.STRING,
                             nmslib.DistType.INT)

    for pos, data in enumerate(DATA_STRS):
        #print pos, data
        nmslib.setData(index, pos, data)

    nmslib.buildIndex(index)

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)
def test_vector():
    n = 4500
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    method_param = ['NN=17', 'initIndexAttempts=3',
                    'initSearchAttempts=1', 'indexThreadQty=4']

    index = nmslib.initIndex(n,
                             space_type,
                             space_param,
                             method_name,
                             method_param,
                             nmslib.DataType.VECTOR,
                             nmslib.DistType.FLOAT)

    for pos, data in enumerate(read_data('sample_dataset.txt')):
        if pos >= n:
            break
        #print pos, data
        nmslib.setData(index, pos, data)

    print 'here'

    nmslib.buildIndex(index)

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)
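Several of the dense-vector tests iterate over read_data('sample_dataset.txt') and read_data('sample_queryset.txt'), a helper that is not reproduced here. A minimal sketch, assuming the sample files contain one whitespace-separated vector per line, could look like this:

# Hypothetical helper (not from the original): one whitespace-separated
# dense vector per line, yielded as a list of floats.
def read_data(file_name):
    with open(file_name) as f:
        for line in f:
            row = line.strip()
            if row:
                yield [float(x) for x in row.split()]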
def test_string_fresh(batch=True):
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.INT)

    if batch:
        print 'DATA_STRS', DATA_STRS
        positions = nmslib.addDataPointBatch(index, np.arange(len(DATA_STRS), dtype=np.int32), DATA_STRS)
    else:
        for id, data in enumerate(DATA_STRS):
            nmslib.addDataPoint(index, id, data)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)
    nmslib.setQueryTimeParams(index, query_time_param)
    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 2
    if batch:
        num_threads = 10
        res = nmslib.knnQueryBatch(index, num_threads, k, QUERY_STRS)
    for idx, data in enumerate(QUERY_STRS):
        res = nmslib.knnQuery(index, k, data)
        print idx, data, res, [DATA_STRS[i] for i in res]

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
def test_object_as_string_fresh(batch=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.FLOAT)

    if batch:
        data = [s for s in read_data_as_string('sample_dataset.txt')]
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3
    for idx, data in enumerate(read_data_as_string('sample_queryset.txt')):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_object_as_string_fresh(batch=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.FLOAT)

    if batch:
        data = [s for s in read_data_as_string('sample_dataset.txt')]
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data_as_string('sample_dataset.txt')):
            nmslib.addDataPoint(index, id, data)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 3
    for idx, data in enumerate(read_data_as_string('sample_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
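Both variants of test_object_as_string_fresh rely on a read_data_as_string helper that is not shown above. A minimal sketch, assuming each non-empty line of the sample file is handed to the index verbatim as a single string object, might be:

# Hypothetical helper (not from the original): yields each non-empty
# line of the file as a raw string.
def read_data_as_string(file_name):
    with open(file_name) as f:
        for line in f:
            row = line.strip()
            if row:
                yield row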
def test_string_loaded():
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb", "d", "c",
                 "bdaf", "ddcd", "egbfa", "a", "fba", "bcccfe",
                 "ab", "bfgbfdc", "bcbbgf", "bfbb"]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.OBJECT_AS_STRING,
        nmslib.DistType.INT)

    for id, data in enumerate(DATA_STRS):
        nmslib.addDataPoint(index, id, data)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.loadIndex(index, index_name)

    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the loaded index:")

    k = 2
    for idx, data in enumerate(QUERY_STRS):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.freeIndex(index)
def test_sparse_vector_fresh():
    space_type = 'cosinesimil_sparse'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.SPARSE_VECTOR,
        nmslib.DistType.FLOAT)

    for id, data in enumerate(read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(index, id, data)

    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 3
    for idx, data in enumerate(read_sparse_data('sample_sparse_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
def test_sparse_vector_fresh():
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.SPARSE_VECTOR,
        nmslib.DistType.FLOAT)

    for id, data in enumerate(read_sparse_data('sample_sparse_dataset.txt')):
        nmslib.addDataPoint(index, id, data)

    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3
    for idx, data in enumerate(read_sparse_data('sample_sparse_queryset.txt')):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
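The sparse tests iterate over read_sparse_data('sample_sparse_dataset.txt'), another helper that is not included here. Judging from the non-batch branch of bench_sparse_vector below, each data point is a list of [id, value] pairs; a sketch under that assumption, with a purely hypothetical "id:value" file format, could be:

# Hypothetical helper (format assumed, not from the original): each line
# holds space-separated "id:value" tokens, returned as [id, value] pairs.
def read_sparse_data(file_name):
    with open(file_name) as f:
        for line in f:
            row = line.strip()
            if row:
                yield [[int(tok.split(':')[0]), float(tok.split(':')[1])]
                       for tok in row.split()]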
def test_vector_loaded():
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT)

    for id, data in enumerate(read_data('sample_dataset.txt')):
        pos = nmslib.addDataPoint(index, id, data)
        if id != pos:
            print 'id %s != pos %s' % (id, pos)
            sys.exit(1)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    query_time_param = ['initSearchAttempts=3']

    nmslib.loadIndex(index, index_name)

    print "The index %s is loaded" % index_name

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'

    print "Results for the loaded index"

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        print idx, nmslib.knnQuery(index, k, data)

    nmslib.freeIndex(index)
def test_vector_loaded():
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT)

    for id, data in enumerate(read_data('sample_dataset.txt')):
        pos = nmslib.addDataPoint(index, id, data)
        if id != pos:
            print('id %s != pos %s' % (id, pos))
            sys.exit(1)

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    query_time_param = ['efSearch=50']

    nmslib.loadIndex(index, index_name)

    print("The index %s is loaded" % index_name)

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the loaded index")

    k = 2
    for idx, data in enumerate(read_data('sample_queryset.txt')):
        print(idx, nmslib.knnQuery(index, k, data))

    nmslib.freeIndex(index)
def test_vector_load(fast=True, fast_batch=True, seq=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    f = '/tmp/foo.txt'
    if not os.path.isfile(f):
        print('creating %s' % f)
        np.savetxt(f, np.random.rand(100000, 1000), delimiter="\t")
        print('done')

    if fast:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('fast add data point'):
            data = read_data_fast(f)
            nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
        nmslib.freeIndex(index)

    if fast_batch:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('fast_batch add data point'):
            offset = 0
            for data in read_data_fast_batch(f, 10000):
                nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32) + offset, data)
                offset += data.shape[0]
        print('offset', offset)
        nmslib.freeIndex(index)

    if seq:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('seq add data point'):
            for id, data in enumerate(read_data(f)):
                nmslib.addDataPoint(index, id, data)
        nmslib.freeIndex(index)
def test_vector_load(fast=True, fast_batch=True, seq=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    f = '/tmp/foo.txt'
    if not os.path.isfile(f):
        print 'creating %s' % f
        np.savetxt(f, np.random.rand(100000, 1000), delimiter="\t")
        print 'done'

    if fast:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('fast add data point'):
            data = read_data_fast(f)
            nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
        nmslib.freeIndex(index)

    if fast_batch:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('fast_batch add data point'):
            offset = 0
            for data in read_data_fast_batch(f, 10000):
                nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32) + offset, data)
                offset += data.shape[0]
        print 'offset', offset
        nmslib.freeIndex(index)

    if seq:
        index = nmslib.init(
            space_type,
            space_param,
            method_name,
            nmslib.DataType.DENSE_VECTOR,
            nmslib.DistType.FLOAT)
        with TimeIt('seq add data point'):
            for id, data in enumerate(read_data(f)):
                nmslib.addDataPoint(index, id, data)
        nmslib.freeIndex(index)
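test_vector_load and the benchmarks below also use a TimeIt context manager and read_data_fast / read_data_fast_batch helpers that are not reproduced here. A minimal sketch of what they might look like, assuming the fast readers simply wrap numpy.loadtxt over tab-separated files, is:

# Hypothetical helpers (not from the original): a timing context manager
# plus numpy-based readers for tab-separated dense vectors, returned as a
# whole float32 matrix or as fixed-size row chunks.
class TimeIt(object):
    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        print('%s took %.3f secs' % (self.name, time.time() - self.start))


def read_data_fast(file_name, sep='\t'):
    return np.loadtxt(file_name, delimiter=sep, dtype=np.float32)


def read_data_fast_batch(file_name, batch_size, sep='\t'):
    data = read_data_fast(file_name, sep)
    for start in range(0, data.shape[0], batch_size):
        yield data[start:start + batch_size]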
def test_vector_fresh(fast=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print 'data.shape', data.shape
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, id, data)
            if id != pos:
                print 'id %s != pos %s' % (id, pos)
                sys.exit(1)
    end = time.time()
    print 'added data in %s secs' % (end - start)

    print 'Let\'s print a few data entries'
    print 'We have added %d data points' % nmslib.getDataPointQty(index)

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print nmslib.getDataPoint(index, i)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    k = 3
    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print idx, v
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print idx, nmslib.knnQuery(index, k, data)
    end = time.time()
    print 'querying done in %s secs' % (end - start)

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
def freeIndex(self):
    nmslib.freeIndex(self._index)
def clr_mem(self):
    nmslib.freeIndex(self.index)
    self.created = False
def test_vector_fresh(fast=True):
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.DENSE_VECTOR,
        nmslib.DistType.FLOAT)

    start = time.time()
    if fast:
        data = read_data_fast('sample_dataset.txt')
        print('data.shape', data.shape)
        positions = nmslib.addDataPointBatch(index, np.arange(len(data), dtype=np.int32), data)
    else:
        for id, data in enumerate(read_data('sample_dataset.txt')):
            pos = nmslib.addDataPoint(index, id, data)
            if id != pos:
                print('id %s != pos %s' % (id, pos))
                sys.exit(1)
    end = time.time()
    print('added data in %s secs' % (end - start))

    print('Let\'s print a few data entries')
    print('We have added %d data points' % nmslib.getDataPointQty(index))

    print("Distance between points (0,0) " + str(nmslib.getDistance(index, 0, 0)))
    print("Distance between points (1,1) " + str(nmslib.getDistance(index, 1, 1)))
    print("Distance between points (0,1) " + str(nmslib.getDistance(index, 0, 1)))
    print("Distance between points (1,0) " + str(nmslib.getDistance(index, 1, 0)))

    for i in range(0, min(MAX_PRINT_QTY, nmslib.getDataPointQty(index))):
        print(nmslib.getDataPoint(index, i))

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    k = 3
    start = time.time()
    if fast:
        num_threads = 10
        query = read_data_fast('sample_queryset.txt')
        res = nmslib.knnQueryBatch(index, num_threads, k, query)
        for idx, v in enumerate(res):
            print(idx, v)
    else:
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print(idx, nmslib.knnQuery(index, k, data))
    end = time.time()
    print('querying done in %s secs' % (end - start))

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def bench_sparse_vector(batch=True):
    dim = 20000

    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print 'dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0]

    k = 3

    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print 'q0 res', res[:k]

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = snn.MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print res[:5]
    for i in res[0]:
        print int(i), distance.cosine(q0, dataset[int(i), :])

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.SPARSE_VECTOR,
        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print 'positions', positions
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print 'Let\'s invoke the index-build process'

    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
    query_time_param = ['initSearchAttempts=3']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print 'The index is created'

    nmslib.setQueryTimeParams(index, query_time_param)

    print 'Query time parameters are set'

    print "Results for the freshly created index:"

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print idx, v
                if idx == 0:
                    for i in v:
                        print 'q0', i, distance.cosine(q0, dataset[i, :])
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print idx, res

    nmslib.saveIndex(index, index_name)

    print "The index %s is saved" % index_name

    nmslib.freeIndex(index)
def bench_sparse_vector(batch=True):
    # delay importing these so CI can import module
    from scipy.sparse import csr_matrix
    from scipy.spatial import distance
    from pysparnn.cluster_index import MultiClusterIndex

    dim = 20000

    dataset = np.random.binomial(1, 0.01, size=(40000, dim))
    queryset = np.random.binomial(1, 0.009, size=(1000, dim))

    print('dataset[0]:', [[i, v] for i, v in enumerate(dataset[0]) if v > 0])

    k = 3

    q0 = queryset[0]
    res = []
    for i in range(dataset.shape[0]):
        res.append([i, distance.cosine(q0, dataset[i, :])])
    res.sort(key=lambda x: x[1])
    print('q0 res', res[:k])

    data_matrix = csr_matrix(dataset, dtype=np.float32)
    query_matrix = csr_matrix(queryset, dtype=np.float32)

    data_to_return = range(dataset.shape[0])

    with TimeIt('building MultiClusterIndex'):
        cp = MultiClusterIndex(data_matrix, data_to_return)

    with TimeIt('knn search'):
        res = cp.search(query_matrix, k=k, return_distance=False)

    print(res[:5])
    for i in res[0]:
        print(int(i), distance.cosine(q0, dataset[int(i), :]))

    #space_type = 'cosinesimil_sparse'
    space_type = 'cosinesimil_sparse_fast'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '_sparse.index'
    if os.path.isfile(index_name):
        os.remove(index_name)

    index = nmslib.init(
        space_type,
        space_param,
        method_name,
        nmslib.DataType.SPARSE_VECTOR,
        nmslib.DistType.FLOAT)

    if batch:
        with TimeIt('batch add'):
            positions = nmslib.addDataPointBatch(index, np.arange(len(dataset), dtype=np.int32), data_matrix)
        print('positions', positions)
    else:
        d = []
        q = []
        with TimeIt('preparing'):
            for data in dataset:
                d.append([[i, v] for i, v in enumerate(data) if v > 0])
            for data in queryset:
                q.append([[i, v] for i, v in enumerate(data) if v > 0])
        with TimeIt('adding points'):
            for id, data in enumerate(d):
                nmslib.addDataPoint(index, id, data)

    print('Let\'s invoke the index-build process')

    index_param = ['NN=17', 'efConstruction=50', 'indexThreadQty=4']
    query_time_param = ['efSearch=50']

    with TimeIt('building index'):
        nmslib.createIndex(index, index_param)

    print('The index is created')

    nmslib.setQueryTimeParams(index, query_time_param)

    print('Query time parameters are set')

    print("Results for the freshly created index:")

    with TimeIt('knn query'):
        if batch:
            num_threads = 10
            res = nmslib.knnQueryBatch(index, num_threads, k, query_matrix)
            for idx, v in enumerate(res):
                if idx < 5:
                    print(idx, v)
                if idx == 0:
                    for i in v:
                        print('q0', i, distance.cosine(q0, dataset[i, :]))
        else:
            for idx, data in enumerate(q):
                res = nmslib.knnQuery(index, k, data)
                if idx < 5:
                    print(idx, res)

    nmslib.saveIndex(index, index_name)

    print("The index %s is saved" % index_name)

    nmslib.freeIndex(index)
def test_save_and_load(data, init_nn=3, init_index=3, init_search=3):
    import nmslib
    reload(nmslib)

    n = data.shape[0]
    space_type = 'l2'
    space_param = []
    method_name = 'small_world_rand'
    method_param = ['NN=%d' % init_nn,
                    'initIndexAttempts=%d' % init_index,
                    'initSearchAttempts=%d' % init_search,
                    'indexThreadQty=4',
                    'graphFileName=savedGraph.txt',
                    'saveGraphFile=1',
                    'loadGraphFile=0']

    index = nmslib.initIndex(n,
                             space_type,
                             space_param,
                             method_name,
                             method_param,
                             nmslib.DataType.VECTOR,
                             nmslib.DistType.FLOAT)

    t0 = time.time()
    for pos, d in enumerate(data):
        nmslib.setData(index, pos, d.tolist())
    nmslib.buildIndex(index)
    print 'Building %i dataset took %1.4f' % (data.shape[0], time.time() - t0)

    def query(q, k=10, m=3):
        return nmslib.knnQuery(index, k, q.tolist())

    print 'building score: '
    print test_method(query)

    nmslib.freeIndex(index)

    method_param = ['NN=%d' % init_nn,
                    'initIndexAttempts=%d' % init_index,
                    'initSearchAttempts=%d' % init_search,
                    'indexThreadQty=1',
                    'graphFileName=savedGraph.txt',
                    'saveGraphFile=0',
                    'loadGraphFile=1']

    index2 = nmslib.initIndex(n,
                              space_type,
                              space_param,
                              method_name,
                              method_param,
                              nmslib.DataType.VECTOR,
                              nmslib.DistType.FLOAT)

    t0 = time.time()
    for pos, d in enumerate(data):
        nmslib.setData(index2, pos, d.tolist())
    nmslib.buildIndex(index2)
    print 'Building %i dataset took %1.4f' % (data.shape[0], time.time() - t0)

    def query2(q, k=10, m=3):
        return nmslib.knnQuery(index2, k, q.tolist())

    print 'loading score: '
    print test_method(query2)

    nmslib.freeIndex(index2)
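test_save_and_load scores each index through a test_method helper that is defined elsewhere in the original script and not shown here. Purely as a hypothetical stand-in, assuming it only needs a callable that maps a query vector to a list of neighbour ids and that DIM matches the dimensionality of the indexed data, one could time a batch of random lookups:

# Hypothetical stand-in for the missing test_method helper (not the
# original scoring code). It assumes `query` maps a DIM-dimensional
# numpy vector to a list of neighbour ids; only throughput is reported.
DIM = 32  # assumed to equal data.shape[1] of the indexed dataset

def test_method(query, num_queries=100):
    queries = np.random.rand(num_queries, DIM).astype(np.float32)
    t0 = time.time()
    results = [query(q) for q in queries]
    return 'ran %d queries in %.4f secs' % (len(results), time.time() - t0)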
def freeIndex(self):
    import nmslib
    nmslib.freeIndex(self._index)