Beispiel #1
0
def test_string_fresh():
    """Build, query and save a fresh string index, then free it.

    Adds a fixed set of short strings to a 'leven' space index that uses
    the 'small_world_rand' method, creates the index, prints the 2 nearest
    neighbours reported for each query string, and saves the index to
    '<method_name>.index'.

    The index handle is released in a ``finally`` clause so it is freed
    even when indexing, querying or saving raises.
    """
    DATA_STRS = [
        "xyz", "beagcfa", "cea", "cb", "d", "c", "bdaf", "ddcd", "egbfa", "a",
        "fba", "bcccfe", "ab", "bfgbfdc", "bcbbgf", "bfbb"
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib_vector.init(space_type, space_param, method_name,
                               nmslib_vector.DataType.STRING,
                               nmslib_vector.DistType.INT)
    try:
        # The position of each string in DATA_STRS is used as its point id.
        for point_id, data in enumerate(DATA_STRS):
            nmslib_vector.addDataPoint(index, point_id, data)

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib_vector.createIndex(index, index_param)
        nmslib_vector.setQueryTimeParams(index, query_time_param)

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('Query time parameters are set')

        print("Results for the freshly created index:")

        k = 2
        for idx, data in enumerate(QUERY_STRS):
            print("%s %s" % (idx, nmslib_vector.knnQuery(index, k, data)))

        nmslib_vector.saveIndex(index, index_name)

        print("The index %s is saved" % index_name)
    finally:
        nmslib_vector.freeIndex(index)
Beispiel #2
0
def test_vector_loaded():
    """Load a previously saved vector index, query it, then free it.

    Adds every record returned by ``read_data('sample_dataset.txt')`` to a
    'cosinesimil' space index that uses the 'small_world_rand' method,
    loads the index structure from '<method_name>.index', and prints the
    2 nearest neighbours reported for each record of
    ``read_data('sample_queryset.txt')``.

    The index handle is released in a ``finally`` clause so it is freed
    even when loading or querying raises.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    index = nmslib_vector.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib_vector.DataType.VECTOR,
                             nmslib_vector.DistType.FLOAT)

    try:
        # The data points are added before loadIndex is called; the
        # enumeration position of each record is used as its point id.
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            nmslib_vector.addDataPoint(index, point_id, data)

        query_time_param = ['initSearchAttempts=3']

        nmslib_vector.loadIndex(index, index_name)

        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("The index %s is loaded" % index_name)

        nmslib_vector.setQueryTimeParams(index, query_time_param)

        print('Query time parameters are set')

        print("Results for the loaded index")

        k = 2
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print("%s %s" % (idx, nmslib_vector.knnQuery(index, k, data)))
    finally:
        nmslib_vector.freeIndex(index)
Beispiel #3
0
def test_vector_loaded():
    """Query a vector index whose structure is loaded from disk.

    Steps, in order:
      1. initialise a 'cosinesimil' / 'small_world_rand' index for float
         vectors;
      2. add each record from ``read_data('sample_dataset.txt')``;
      3. load the saved index from '<method_name>.index';
      4. set the query-time parameters and print the 2 nearest neighbours
         reported for each record from ``read_data('sample_queryset.txt')``.

    ``freeIndex`` runs in a ``finally`` clause so the handle is released
    even if one of the steps above raises.
    """
    space_type = 'cosinesimil'
    space_param = []
    method_name = 'small_world_rand'
    index_name = method_name + '.index'
    index = nmslib_vector.init(space_type, space_param, method_name,
                               nmslib_vector.DataType.VECTOR,
                               nmslib_vector.DistType.FLOAT)

    try:
        # Enumeration position doubles as the point id.
        for point_id, data in enumerate(read_data('sample_dataset.txt')):
            nmslib_vector.addDataPoint(index, point_id, data)

        query_time_param = ['initSearchAttempts=3']

        nmslib_vector.loadIndex(index, index_name)

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print("The index %s is loaded" % index_name)

        nmslib_vector.setQueryTimeParams(index, query_time_param)

        print('Query time parameters are set')

        print("Results for the loaded index")

        k = 2
        for idx, data in enumerate(read_data('sample_queryset.txt')):
            print("%s %s" % (idx, nmslib_vector.knnQuery(index, k, data)))
    finally:
        nmslib_vector.freeIndex(index)
Beispiel #4
0
    def fit(self, X):
        """Populate and build (or load) the nmslib index for the rows of ``X``.

        Every row of ``X`` is added as a data point whose id is its row
        position. If a file named ``self._index_name`` exists the index is
        loaded from it; otherwise it is created with ``self._index_param``.
        Query-time parameters are then set from ``self._query_param``.

        ``OMP_THREAD_LIMIT`` is set to '40' for the duration of the call and
        to '1' afterwards; the reset now happens in a ``finally`` clause so
        it also takes place when index construction raises.

        Args:
            X: data set to index. The code reads ``X.shape[0]``, iterates
                over ``X`` and calls ``.tolist()`` on each row
                (presumably a 2-D numpy array -- confirm against callers).
        """
        os.environ['OMP_THREAD_LIMIT'] = '40'
        try:
            import nmslib_vector
            if self._method_name == 'vptree':
                # To avoid this issue:
                # terminate called after throwing an instance of 'std::runtime_error'
                # what():  The data size is too small or the bucket size is too big. Select the parameters so that <total # of records> is NOT less than <bucket size> * 1000
                # Aborted (core dumped)
                #
                # NOTE(review): for fewer than 2000 rows this expression
                # evaluates to bucketSize=0 -- confirm nmslib accepts that.
                # NOTE(review): this appends to self._index_param in place,
                # so calling fit() again appends another bucketSize entry.
                self._index_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000))

            self._index = nmslib_vector.init(self._nmslib_metric, [], self._method_name, nmslib_vector.DataType.VECTOR, nmslib_vector.DistType.FLOAT)

            for i, x in enumerate(X):
                nmslib_vector.addDataPoint(self._index, i, x.tolist())

            if os.path.exists(self._index_name):
                # Single-argument print(...) works on Python 2 and 3 alike.
                print("Loading index from file")
                nmslib_vector.loadIndex(self._index, self._index_name)
            else:
                nmslib_vector.createIndex(self._index, self._index_param)
                # Saving the freshly built index is intentionally disabled:
                # nmslib_vector.saveIndex(self._index, self._index_name)

            nmslib_vector.setQueryTimeParams(self._index, self._query_param)
        finally:
            # Always drop back to a single thread, even if the build failed.
            os.environ['OMP_THREAD_LIMIT'] = '1'
Beispiel #5
0
def load_index(col_name, category, index_version, reindex=False):
    """Load the saved nmslib index for one collection/category pair.

    The index file is
    ``'indexes/' + col_name + '_' + category + index_version + '.index'``.
    When ``reindex`` is true or that file does not exist,
    ``build_n_save(col_name, category, index_version)`` is called first.

    The data points are then re-added from ``db[col_name]`` (documents whose
    ``categories`` match ``category`` and that have the field
    ``'nmslib_index' + index_version``), the index is loaded from the file
    and the query-time parameters are set.

    Args:
        col_name: name of the collection in ``db`` to read items from.
        category: value matched against each item's ``categories`` field.
        index_version: string suffix used both in the index file name and
            in the per-item index field name.
        reindex: force a rebuild via ``build_n_save`` before loading.

    Returns:
        A ``(index, nmslib_vector)`` tuple: the loaded index handle and the
        ``nmslib_vector`` module itself.
    """
    space_type = 'jsdivslow'
    space_param = []
    method_name = 'small_world_rand'
    index_name = 'indexes/' + col_name + '_' + category + index_version + '.index'
    file_exists = isfile(index_name)
    if reindex or not file_exists:
        build_n_save(col_name, category, index_version)

    index = nmslib_vector.init(space_type, space_param, method_name,
                               nmslib_vector.DataType.VECTOR,
                               nmslib_vector.DistType.FLOAT)
    nmslib_index = 'nmslib_index' + index_version
    all_items_in_category = db[col_name].find({
        'categories': category,
        nmslib_index: {
            '$exists': 1
        }
    })
    t1 = time()
    for idx, item in enumerate(all_items_in_category):
        fp = item['fingerprint']
        # A fingerprint is either the colour vector itself (list) or a
        # mapping that carries it under 'color'; anything else is skipped.
        # isinstance also accepts list/dict subclasses.
        if isinstance(fp, list):
            color = fp
        elif isinstance(fp, dict):
            color = fp['color']
        else:
            print('else')
            continue
        # idx keeps counting across skipped items, so point ids may have gaps.
        nmslib_vector.addDataPoint(index, idx, color)
    t2 = time()
    print('addDataPoints took %s secs' % (t2 - t1))
    query_time_param = ['initSearchAttempts=3']
    nmslib_vector.loadIndex(index, index_name)
    print("The index %s is loaded" % index_name)
    t3 = time()
    # This interval covers loadIndex; the label previously said createIndex.
    print('loadIndex took %s secs' % (t3 - t2))

    nmslib_vector.setQueryTimeParams(index, query_time_param)

    return index, nmslib_vector
Beispiel #6
0
def create_index(col_name, category):
    """Build and save an nmslib index for one collection/category pair.

    Every document in ``db[col_name]`` whose ``categories`` match
    ``category`` contributes its fingerprint colour vector as a data point.
    The point id assigned to each item is written back to the document as
    ``nmslib_index``. The index is then created, configured for querying
    and saved to ``col_name + '_' + category + '.index'``.

    Args:
        col_name: name of the collection in ``db`` to read items from.
        category: value matched against each item's ``categories`` field.

    Returns:
        A ``(index, nmslib_vector)`` tuple: the built index handle and the
        ``nmslib_vector`` module itself.
    """
    space_type = 'jsdivslow'
    space_param = []
    method_name = 'small_world_rand'
    index_name = col_name + '_' + category + '.index'
    index = nmslib_vector.init(space_type, space_param, method_name,
                               nmslib_vector.DataType.VECTOR,
                               nmslib_vector.DistType.FLOAT)

    all_items_in_category = db[col_name].find({'categories': category})
    t1 = time()
    for idx, item in enumerate(all_items_in_category):
        fp = item['fingerprint']
        # A fingerprint is either the colour vector itself (list) or a
        # mapping that carries it under 'color'; anything else is skipped.
        # isinstance also accepts list/dict subclasses, which an exact
        # type comparison would silently drop.
        if isinstance(fp, list):
            color = fp
        elif isinstance(fp, dict):
            color = fp['color']
        else:
            print('else')
            continue
        nmslib_vector.addDataPoint(index, idx, color)
        # Record the point id on the item so it can be looked up later.
        item_id = item['_id']
        db[col_name].update_one({'_id': item_id},
                                {'$set': {
                                    'nmslib_index': idx
                                }})
    t2 = time()
    print('addDataPoints took %s secs' % str(t2 - t1))
    index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=32']
    query_time_param = ['initSearchAttempts=3']
    nmslib_vector.createIndex(index, index_param)
    t3 = time()
    print('createIndex took %s secs' % str(t3 - t2))

    nmslib_vector.setQueryTimeParams(index, query_time_param)

    nmslib_vector.saveIndex(index, index_name)

    return index, nmslib_vector
Beispiel #7
0
def test_string_fresh():
    """Create a string index from scratch, query it, save it and free it.

    Steps, in order:
      1. initialise a 'leven' / 'small_world_rand' index for strings with
         integer distances;
      2. add each entry of ``DATA_STRS``, using its list position as id;
      3. create the index and set the query-time parameters;
      4. print the 2 nearest neighbours reported for each query string;
      5. save the index to '<method_name>.index'.

    ``freeIndex`` runs in a ``finally`` clause so the handle is released
    even if one of the steps above raises.
    """
    DATA_STRS = ["xyz", "beagcfa", "cea", "cb",
                  "d", "c", "bdaf", "ddcd",
                  "egbfa", "a", "fba", "bcccfe",
                  "ab", "bfgbfdc", "bcbbgf", "bfbb"
    ]
    QUERY_STRS = ["abc", "def", "ghik"]
    space_type = 'leven'
    space_param = []
    method_name = 'small_world_rand'
    index_name  = method_name + '.index'
    index = nmslib_vector.init(
                             space_type,
                             space_param,
                             method_name,
                             nmslib_vector.DataType.STRING,
                             nmslib_vector.DistType.INT)
    try:
        for point_id, data in enumerate(DATA_STRS):
            nmslib_vector.addDataPoint(index, point_id, data)

        index_param = ['NN=17', 'initIndexAttempts=3', 'indexThreadQty=4']
        query_time_param = ['initSearchAttempts=3']

        nmslib_vector.createIndex(index, index_param)
        nmslib_vector.setQueryTimeParams(index, query_time_param)

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print('Query time parameters are set')

        print("Results for the freshly created index:")

        k = 2
        for idx, data in enumerate(QUERY_STRS):
            print("%s %s" % (idx, nmslib_vector.knnQuery(index, k, data)))

        nmslib_vector.saveIndex(index, index_name)

        print("The index %s is saved" % index_name)
    finally:
        nmslib_vector.freeIndex(index)
Beispiel #8
0
def init_index(idx, k=10):
    """Create the index stored in ``idx[0]`` and remember ``k`` in ``idx[2]``.

    Args:
        idx: mutable sequence; element 0 is the nmslib index handle passed
            to ``createIndex`` / ``setQueryTimeParams``, element 2 is
            overwritten with ``k``.
        k: value used for the ``NN`` index-time parameter (default 10).
    """
    build_params = [
        'NN=%d' % k,
        'initIndexAttempts=3',
        'indexThreadQty=8',
    ]
    search_params = ['initSearchAttempts=3']

    handle = idx[0]
    nmslib_vector.createIndex(handle, build_params)
    nmslib_vector.setQueryTimeParams(handle, search_params)

    # Record the NN value the index was built with alongside the handle.
    idx[2] = k