Example #1
def reset(args):
    if args.force or input('Type "yes" to delete ALL data: ') == 'yes':
        DB.flushdb()
        DS.flushdb()
        print('All data has been deleted.')
    else:
        print('Nothing has been deleted.')
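For context, reset only expects an object with a force attribute, as the tests further down show. A minimal sketch of wiring it into an argparse-based CLI (this wiring is illustrative, not the actual addok entry point):

import argparse

def main():
    parser = argparse.ArgumentParser(description='Delete all indexed data.')
    parser.add_argument('--force', action='store_true',
                        help='Skip the interactive confirmation prompt.')
    # reset() reads args.force to decide whether to prompt before flushing.
    reset(parser.parse_args())

if __name__ == '__main__':
    main()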
Example #2
def only_commons(helper):
    if len(helper.tokens) == len(helper.common):
        # Only common terms, shortcut to search
        keys = [t.db_key for t in helper.tokens]
        if helper.geohash_key:
            keys.append(helper.geohash_key)
            helper.debug('Adding geohash %s', helper.geohash_key)
        if len(keys) == 1 or helper.geohash_key:
            helper.add_to_bucket(keys)
        if helper.bucket_dry and len(keys) > 1:
            count = 0
            # Scan the least frequent token.
            helper.tokens.sort(key=lambda t: t.frequency)
            first = helper.tokens[0]
            if first.frequency < config.INTERSECT_LIMIT:
                helper.debug('Under INTERSECT_LIMIT, brute force.')
                keys = [t.db_key for t in helper.tokens]
                helper.add_to_bucket(keys)
            else:
                helper.debug('INTERSECT_LIMIT hit, manual scan on %s', first)
                others = [t.db_key for t in helper.tokens[1:]]
                ids = DB.zrevrange(first.db_key, 0, 500)
                for id_ in ids:
                    count += 1
                    if all(DB.sismember(f, id_) for f in helper.filters) \
                       and all(DB.zrank(k, id_) for k in others):
                        helper.bucket.add(id_)
                    if helper.bucket_full:
                        break
                helper.debug('%s results after scan (%s loops)',
                             len(helper.bucket), count)
Example #3
def test_index_edge_ngrams():
    before = count_keys()
    index_edge_ngrams(DB, 'street')
    after = count_keys()
    assert after - before == 3
    assert DB.smembers('n|str') == set([b'street'])
    assert DB.smembers('n|stre') == set([b'street'])
    assert DB.smembers('n|stree') == set([b'street'])
Example #4
def test_force_reset(factory):
    class Args:
        force = True

    factory(name="rue de l'avoine")
    assert DB.keys()
    reset(Args())
    assert not DB.keys()
Example #5
def test_index_document_with_skip_digit_false(config):
    from addok.helpers.index import _CACHE
    _CACHE.clear()  # Do this in addok.pytest teardown?
    config.TRIGRAM_SKIP_DIGIT = False
    index_document(DOC.copy())
    assert DB.exists('w|123')
    assert DB.exists('w|234')
    assert DB.exists('w|345')
    assert len(DB.keys()) == 17
Example #6
def test_allow_list_values():
    doc = {
        'id': 'xxxx',
        'type': 'street',
        'name': ['Vernou-la-Celle-sur-Seine', 'Vernou'],
        'city': 'Paris',
        'lat': '49.32545',
        'lon': '4.2565'
    }
    index_document(doc)
    assert DB.zscore('w|vernou', 'd|xxxx') == 4
    assert DB.zscore('w|celle', 'd|xxxx') == 4 / 5
Example #7
def test_reset(factory, monkeypatch):
    class Args:
        force = False

    factory(name="rue de l'avoine")
    assert DB.keys()
    monkeypatch.setitem(__builtins__, 'input', lambda *args, **kwargs: 'no')
    reset(Args())
    assert DB.keys()
    monkeypatch.setitem(__builtins__, 'input', lambda *args, **kwargs: 'yes')
    reset(Args())
    assert not DB.keys()
Example #8
def test_deindex_document_should_deindex_list_values():
    doc = {
        'id': 'xxxx',
        'type': 'street',
        'name': ['Vernou-la-Celle-sur-Seine', 'Vernou'],
        'city': 'Paris',
        'lat': '49.32545',
        'lon': '4.2565'
    }
    index_document(doc)
    deindex_document(doc['id'])
    assert not ds._DB.exists('d|xxxx')
    assert not DB.exists('w|vernou')
    assert not DB.exists('w|celle')
    assert len(DB.keys()) == 0
Example #9
def test_deindex_document_should_deindex_list_values():
    doc = {
        'id': 'xxxx',
        'type': 'street',
        'name': ['Vernou-la-Celle-sur-Seine', 'Vernou'],
        'city': 'Paris',
        'lat': '49.32545',
        'lon': '4.2565'
    }
    index_document(doc)
    deindex_document(doc['id'])
    assert not DB.exists('d|xxxx')
    assert not DB.exists('w|vernou')
    assert not DB.exists('w|celle')
    assert len(DB.keys()) == 0
Example #10
def only_commons(helper):
    if len(helper.tokens) == len(helper.common):
        # Only common terms, shortcut to search
        keys = [t.db_key for t in helper.tokens]
        if helper.geohash_key:
            keys.append(helper.geohash_key)
            helper.debug('Adding geohash %s', helper.geohash_key)
        if len(keys) == 1 or helper.geohash_key:
            helper.add_to_bucket(keys)
        if helper.bucket_dry and len(keys) > 1:
            # Scan the least frequent token.
            helper.tokens.sort(key=lambda t: t.frequency)
            keys = [t.db_key for t in helper.tokens]
            first = helper.tokens[0]
            if first.frequency < config.INTERSECT_LIMIT:
                helper.debug('Under INTERSECT_LIMIT, force intersect.')
                helper.add_to_bucket(keys)
            else:
                helper.debug('INTERSECT_LIMIT hit, manual scan')
                if helper.filters:
                    # Always consider filters when doing manual intersect.
                    keys = keys + helper.filters
                    # But, hey, can we brute force again?
                    if any(
                            DB.scard(k) < config.INTERSECT_LIMIT
                            for k in helper.filters):
                        helper.debug('Filters under INTERSECT_LIMIT, force')
                        helper.add_to_bucket(keys)
                        return
                helper.debug('manual scan on "%s"', first)
                ids = scripts.manual_scan(keys=keys, args=[helper.min])
                helper.bucket.update(ids)
                helper.debug('%s results after scan', len(helper.bucket))
Example #11
def test_index_document_without_explicit_id():
    doc = DOC.copy()
    del doc['_id']
    index_document(doc)
    assert ds._DB.exists('d|jR')
    assert ds._DB.type('d|jR') == b'string'
    assert DB.exists('w|rue')
Example #12
def test_should_be_possible_to_override_boost_with_callable(config):
    config.FIELDS = [
        {'key': 'name', 'boost': lambda doc: 5},
        {'key': 'city'},
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': 'Lilas',
        'city': 'Cergy'
    }
    index_document(doc)
    assert DB.exists('d|xxxx')
    assert DB.zscore('w|lilas', 'd|xxxx') == 5
    assert DB.zscore('w|cergy', 'd|xxxx') == 1
Example #13
def deindex_document(id_, **kwargs):
    key = document_key(id_)
    doc = DB.hgetall(key)
    if not doc:
        return
    tokens = []
    for indexer in config.DEINDEXERS:
        indexer(DB, key, doc, tokens, **kwargs)
Example #14
def deindex_document(id_, **kwargs):
    key = keys.document_key(id_)
    doc = DB.hgetall(key)
    if not doc:
        return
    tokens = []
    for indexer in config.DEINDEXERS:
        indexer(DB, key, doc, tokens, **kwargs)
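As the loop above shows, every callable registered in config.DEINDEXERS is invoked as indexer(DB, key, doc, tokens, **kwargs), where doc is the raw hash returned by DB.hgetall. A minimal sketch of a custom deindexer honoring that signature; the 'extra' field and its 'w|' key prefix are hypothetical:

def extra_field_deindexer(db, key, doc, tokens, **kwargs):
    # doc comes from DB.hgetall, so keys and values are bytes under
    # redis-py defaults.
    value = doc.get(b'extra')
    if value:
        # Drop this document from the hypothetical token's sorted set.
        db.zrem('w|' + value.decode(), key)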
Example #15
def pair(word):
    """See all token associated with a given token.
    PAIR lilas"""
    word = list(preprocess_query(word))[0]
    key = pair_key(word)
    tokens = [t.decode() for t in DB.smembers(key)]
    tokens.sort()
    print(white(tokens))
    print(magenta('(Total: {})'.format(len(tokens))))
Example #16
def test_should_be_possible_to_define_fields_from_config(config):
    config.FIELDS = [
        {'key': 'custom'},
        {'key': 'special'},
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'custom': 'rue',
        'special': 'Lilas',
        'thisone': 'is not indexed',
    }
    index_document(doc)
    assert DB.exists('d|xxxx')
    assert DB.exists('w|lilas')
    assert DB.exists('w|rue')
    assert not DB.exists('w|indexed')
Example #17
def pair(cmd, word):
    """See all token associated with a given token.
    PAIR lilas"""
    word = list(preprocess_query(word))[0]
    key = pair_key(word)
    tokens = [t.decode() for t in DB.smembers(key)]
    tokens.sort()
    print(white(tokens))
    print(magenta('(Total: {})'.format(len(tokens))))
Example #18
def test_null_value_should_not_be_index(config):
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': 'Port-Cergy',
        'city': ''
    }
    index_document(doc)
    assert 'city' not in DB.hgetall('d|xxxx')
Example #19
def test_field_with_only_non_alphanumeric_chars_is_not_indexed():
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': 'Lilas',
        'city': '//'
    }
    index_document(doc)
    assert 'city' not in DB.hgetall('d|xxxx')
Example #20
def _compute_onetomany_relations(tokens):
    relations = defaultdict(list)
    for token in tokens:
        for other in tokens:
            if other == token:
                continue
            if (token in relations[other]
                    or DB.sismember(pair_key(token), other)):
                relations[token].append(other)
    return relations
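For illustration, here is the same one-to-many relation logic with the Redis lookup stubbed out; PAIRS is a hypothetical in-memory stand-in for DB.sismember(pair_key(token), other):

from collections import defaultdict

# Hypothetical pair co-occurrence data standing in for the Redis pair sets.
PAIRS = {('rue', 'lilas'), ('lilas', 'rue'), ('rue', 'des')}

def compute_onetomany_relations(tokens):
    relations = defaultdict(list)
    for token in tokens:
        for other in tokens:
            if other == token:
                continue
            # Same rule as above: relate the tokens if the reverse relation
            # was already recorded, or if the pair co-occurs in the index.
            if token in relations[other] or (token, other) in PAIRS:
                relations[token].append(other)
    return relations

print(dict(compute_onetomany_relations(['rue', 'des', 'lilas'])))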
Example #21
def index_document(doc, **kwargs):
    key = keys.document_key(doc['id'])
    pipe = DB.pipeline()
    tokens = {}
    for indexer in config.INDEXERS:
        try:
            indexer(pipe, key, doc, tokens, **kwargs)
        except ValueError as e:
            print(e)
            return  # Do not index.
    pipe.execute()
Example #22
def index_document(doc, **kwargs):
    key = document_key(doc['id'])
    pipe = DB.pipeline()
    tokens = {}
    for indexer in config.INDEXERS:
        try:
            indexer(pipe, key, doc, tokens, **kwargs)
        except ValueError as e:
            print(e)
            return  # Do not index.
    pipe.execute()
Example #23
def test_create_edge_ngrams(config):
    config.MIN_EDGE_NGRAMS = 2
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': '28 Lilas',  # 28 should not appear in ngrams
        'city': 'Paris'
    }
    index_document(doc, update_ngrams=False)
    assert not DB.exists('n|li')
    assert not DB.exists('n|lil')
    assert not DB.exists('n|pa')
    assert not DB.exists('n|par')
    create_edge_ngrams()
    assert DB.exists('n|li')
    assert DB.exists('n|lil')
    assert DB.exists('n|pa')
    assert DB.exists('n|par')
    assert not DB.exists('n|28')
    assert len(DB.keys()) == 12
Example #24
def do_fuzzyindex(self, word):
    """Compute fuzzy extensions of word that exist in index.
    FUZZYINDEX lilas"""
    word = list(preprocess_query(word))[0]
    token = Token(word)
    token.make_fuzzy()
    neighbors = [(n, DB.zcard(dbkeys.token_key(n))) for n in token.neighbors]
    neighbors.sort(key=lambda n: n[1], reverse=True)
    for token, freq in neighbors:
        if freq == 0:
            break
        print(white(token), blue(freq))
Example #25
def test_should_be_possible_to_override_boost_with_callable(config):
    config.FIELDS = [
        {
            'key': 'name',
            'boost': lambda doc: 5
        },
        {
            'key': 'city'
        },
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': 'Lilas',
        'city': 'Cergy'
    }
    index_document(doc)
    assert ds._DB.exists('d|xxxx')
    assert DB.zscore('w|lilas', 'd|xxxx') == 5
    assert DB.zscore('w|cergy', 'd|xxxx') == 1
Example #26
def do_fuzzyindex(self, word):
    """Compute fuzzy extensions of word that exist in index.
    FUZZYINDEX lilas"""
    word = list(preprocess_query(word))[0]
    token = Token(word)
    neighbors = make_fuzzy(token)
    neighbors = [(n, DB.zcard(dbkeys.token_key(n))) for n in neighbors]
    neighbors.sort(key=lambda n: n[1], reverse=True)
    for token, freq in neighbors:
        if freq == 0:
            break
        print(white(token), blue(freq))
Example #27
def test_should_be_possible_to_override_boost_from_config(config):
    config.FIELDS = [
        {
            'key': 'name',
            'boost': 5
        },
        {
            'key': 'city'
        },
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': 'Lilas',
        'city': 'Cergy'
    }
    index_document(doc)
    assert DB.exists('d|xxxx')
    assert DB.zscore('w|lila', 'd|xxxx') == 5
    assert DB.zscore('w|serji', 'd|xxxx') == 1
Example #28
def test_index_housenumber_uses_housenumber_preprocessors(config):
    doc = {
        "id": "xxxx",
        "type": "street",
        "name": "rue des Lilas",
        "city": "Paris",
        "lat": "49.32545",
        "lon": "4.2565",
        "housenumbers": {"1 bis": {"lat": "48.325451", "lon": "2.25651"}},
    }
    index_document(doc)
    index = DB.hgetall("d|xxxx")
    assert index[b"h|1b"] == b"1 bis|48.325451|2.25651"
Example #29
def test_should_be_possible_to_define_fields_from_config(config):
    config.FIELDS = [
        {
            'key': 'custom'
        },
        {
            'key': 'special'
        },
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'custom': 'rue',
        'special': 'Lilas',
        'thisone': 'is not indexed',
    }
    index_document(doc)
    assert ds._DB.exists('d|xxxx')
    assert DB.exists('w|lilas')
    assert DB.exists('w|rue')
    assert not DB.exists('w|indexed')
Example #30
def test_doc_with_null_value_should_not_be_index_if_not_allowed(config):
    config.FIELDS = [
        {'key': 'name', 'null': False},
        {'key': 'city'},
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': '',
        'city': 'Cergy'
    }
    index_document(doc)
    assert not DB.exists('d|xxxx')
Example #31
def index_ngram_keys(*keys):
    pipe = DB.pipeline(transaction=False)
    for key in keys:
        key = key.decode()
        _, token = key.split('|')
        if token.isdigit():
            continue
        index_edge_ngrams(pipe, token)
    try:
        pipe.execute()
    except redis.RedisError as e:
        msg = 'Error while generating ngrams:\n{}'.format(str(e))
        raise ValueError(msg)
    return keys
Example #32
def index_document(doc, **kwargs):
    key = keys.document_key(doc['id'])
    pipe = DB.pipeline()
    tokens = {}
    for indexer in config.INDEXERS:
        try:
            indexer(pipe, key, doc, tokens, **kwargs)
        except ValueError as e:
            print(e)
            return  # Do not index.
    try:
        pipe.execute()
    except redis.RedisError as e:
        msg = 'Error while importing document:\n{}\n{}'.format(doc, str(e))
        raise ValueError(msg)
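Symmetrically to the deindexers, each callable in config.INDEXERS receives (pipe, key, doc, tokens, **kwargs) and can raise ValueError to keep the document out of the index. A minimal sketch under those assumptions; the 'extra' field is hypothetical and the zadd call uses the redis-py 3 mapping form:

def extra_field_indexer(pipe, key, doc, tokens, **kwargs):
    value = doc.get('extra')
    if value is None:
        # index_document() catches ValueError and skips the document.
        raise ValueError('extra field is required')
    # Writes are queued on the pipeline and only sent to Redis when
    # index_document() calls pipe.execute().
    pipe.zadd('w|' + value, {key: 1})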
Example #33
def create_edge_ngrams():
    start = time.time()
    pool = Pool()
    count = 0
    chunk = []
    for key in DB.scan_iter(match='w|*'):
        count += 1
        chunk.append(key)
        if count % 10000 == 0:
            pool.map(index_ngram_key, chunk)
            print("Done", count, time.time() - start)
            chunk = []
    if chunk:
        pool.map(index_ngram_key, chunk)
    pool.close()
    pool.join()
    print('Done', count, 'in', time.time() - start)
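The chunking pattern above (batching scan results into groups of 10,000 before handing them to worker processes) can be isolated as follows; the names here are illustrative, not addok API:

from multiprocessing import Pool

def process(item):
    return item * 2  # stand-in for per-key work such as index_ngram_key

def run_in_chunks(items, chunk_size=10000):
    # Fan fixed-size chunks of a possibly huge iterable out to a pool,
    # so the parent never materializes the whole key space at once.
    with Pool() as pool:
        chunk = []
        for item in items:
            chunk.append(item)
            if len(chunk) == chunk_size:
                pool.map(process, chunk)
                chunk = []
        if chunk:
            pool.map(process, chunk)  # flush the trailing partial chunk

if __name__ == '__main__':
    run_in_chunks(range(25000))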
Example #34
def create_edge_ngrams(*args):
    start = time.time()
    pool = Pool()
    count = 0
    chunk = []
    for key in DB.scan_iter(match='w|*'):
        count += 1
        chunk.append(key)
        if count % 10000 == 0:
            pool.map(index_ngram_key, chunk)
            print("Done", count, time.time() - start)
            chunk = []
    if chunk:
        pool.map(index_ngram_key, chunk)
    pool.close()
    pool.join()
    print('Done', count, 'in', time.time() - start)
Example #35
def index_documents(docs):
    pipe = DB.pipeline(transaction=False)
    for doc in docs:
        if not doc:
            continue
        if doc.get('_action') in ['delete', 'update']:
            key = keys.document_key(doc['_id']).encode()
            known_doc = get_document(key)
            if known_doc:
                deindex_document(known_doc)
        if doc.get('_action') in ['index', 'update', None]:
            index_document(pipe, doc)
        yield doc
    try:
        pipe.execute()
    except redis.RedisError as e:
        msg = 'Error while importing document:\n{}\n{}'.format(doc, str(e))
        raise ValueError(msg)
Example #36
def store_documents(docs):
    to_upsert = []
    to_remove = []
    for doc in docs:
        if not doc:
            continue
        if '_id' not in doc:
            doc['_id'] = DB.next_id()
        key = keys.document_key(doc['_id'])
        if doc.get('_action') in ['delete', 'update']:
            to_remove.append(key)
        if doc.get('_action') in ['index', 'update', None]:
            to_upsert.append((key, config.DOCUMENT_SERIALIZER.dumps(doc)))
        yield doc
    if to_remove:
        DS.remove(*to_remove)
    if to_upsert:
        DS.upsert(*to_upsert)
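Note that store_documents, like index_documents above, is a generator: nothing is buffered or written until it is iterated, and the final DS.remove/DS.upsert calls only run once the loop completes. A minimal sketch of driving it, assuming the surrounding addok context (DS, keys, config) is set up; deque(..., maxlen=0) just exhausts the generator without keeping results:

from collections import deque

docs = [
    {'_id': 1, 'name': 'rue des Lilas'},
    {'_id': 2, '_action': 'delete'},
]
deque(store_documents(docs), maxlen=0)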
Example #37
def test_index_housenumber_uses_housenumber_preprocessors():
    # By default it glues the ordinal to the number.
    doc = {
        'id': 'xxxx',
        'type': 'street',
        'name': 'rue des Lilas',
        'city': 'Paris',
        'lat': '49.32545',
        'lon': '4.2565',
        'housenumbers': {
            '1 bis': {
                'lat': '48.325451',
                'lon': '2.25651'
            }
        }
    }
    index_document(doc)
    index = DB.hgetall('d|xxxx')
    assert index[b'h|1b'] == b'1 bis|48.325451|2.25651'
Example #38
def autocomplete(helper, tokens, skip_commons=False, use_geohash=False):
    helper.debug('Autocompleting %s', helper.last_token)
    keys = [t.db_key for t in tokens if not t.is_last]
    pair_keys = [pair_key(t) for t in tokens if not t.is_last]
    key = edge_ngram_key(helper.last_token)
    autocomplete_tokens = DB.sinter(pair_keys + [key])
    helper.debug('Found tokens to autocomplete %s', autocomplete_tokens)
    for token in autocomplete_tokens:
        key = dbkeys.token_key(token.decode())
        if skip_commons\
           and token_key_frequency(key) > config.COMMON_THRESHOLD:
            helper.debug('Skip common token to autocomplete %s', key)
            continue
        if not helper.bucket_overflow or helper.last_token in helper.not_found:
            helper.debug('Trying to extend bucket. Autocomplete %s', key)
            extra_keys = [key]
            if use_geohash and helper.geohash_key:
                extra_keys.append(helper.geohash_key)
            helper.add_to_bucket(keys + extra_keys)
Example #39
def test_doc_with_null_value_should_not_be_index_if_not_allowed(config):
    config.FIELDS = [
        {
            'key': 'name',
            'null': False
        },
        {
            'key': 'city'
        },
    ]
    doc = {
        'id': 'xxxx',
        'lat': '49.32545',
        'lon': '4.2565',
        'name': '',
        'city': 'Cergy'
    }
    index_document(doc)
    assert not DB.exists('w|cergy')
Example #40
def autocomplete(helper, tokens, skip_commons=False, use_geohash=False):
    helper.debug('Autocompleting %s', helper.last_token)
    # helper.last_token.autocomplete()
    keys = [t.db_key for t in tokens if not t.is_last]
    pair_keys = [pair_key(t) for t in tokens if not t.is_last]
    key = edge_ngram_key(helper.last_token)
    autocomplete_tokens = DB.sinter(pair_keys + [key])
    helper.debug('Found tokens to autocomplete %s', autocomplete_tokens)
    for token in autocomplete_tokens:
        key = dbkeys.token_key(token.decode())
        if skip_commons\
           and token_key_frequency(key) > config.COMMON_THRESHOLD:
            helper.debug('Skip common token to autocomplete %s', key)
            continue
        if not helper.bucket_overflow or helper.last_token in helper.not_found:
            helper.debug('Trying to extend bucket. Autocomplete %s', key)
            extra_keys = [key]
            if use_geohash and helper.geohash_key:
                extra_keys.append(helper.geohash_key)
            helper.add_to_bucket(keys + extra_keys)
Example #41
def try_fuzzy(helper, tokens, include_common=True):
    if not helper.bucket_dry or not tokens:
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    tokens.sort(key=lambda t: len(t), reverse=True)
    allkeys = helper.keys[:]
    if include_common:
        # As we are in fuzzy, try to narrow as much as possible by adding
        # unused common tokens.
        allkeys.extend(
            [t.db_key for t in helper.common if t.db_key not in helper.keys])
    for try_one in tokens:
        if helper.bucket_full:
            break
        keys = allkeys[:]
        if try_one.db_key in keys:
            keys.remove(try_one.db_key)
        if try_one.isdigit():
            continue
        helper.debug('Going fuzzy with %s and %s', try_one, keys)
        neighbors = make_fuzzy(try_one, max=helper.fuzzy)
        if len(keys):
            # Only retain tokens that have been seen in the index at least
            # once with the other tokens.
            DB.sadd(helper.pid, *neighbors)
            interkeys = [pair_key(k[2:]) for k in keys]
            interkeys.append(helper.pid)
            fuzzy_words = DB.sinter(interkeys)
            DB.delete(helper.pid)
            # Keep the priority we gave in building fuzzy terms (inversion
            # first, then substitution, etc.).
            fuzzy_words = [w.decode() for w in fuzzy_words]
            fuzzy_words.sort(key=lambda x: neighbors.index(x))
        else:
            # The token we are considering is alone.
            fuzzy_words = []
            for neighbor in neighbors:
                key = dbkeys.token_key(neighbor)
                count = DB.zcard(key)
                if count:
                    fuzzy_words.append(neighbor)
        if fuzzy_words:
            helper.debug('Found fuzzy candidates %s', fuzzy_words)
            fuzzy_keys = [dbkeys.token_key(w) for w in fuzzy_words]
            for key in fuzzy_keys:
                if helper.bucket_dry:
                    helper.add_to_bucket(keys + [key])
Example #42
def test_index_should_join_housenumbers_payload_fields(config):
    config.HOUSENUMBERS_PAYLOAD_FIELDS = ['key', 'one']
    doc = {
        'id': 'xxxx',
        'type': 'street',
        'name': 'rue des Lilas',
        'city': 'Paris',
        'lat': '49.32545',
        'lon': '4.2565',
        'housenumbers': {
            '1 bis': {
                'lat': '48.325451',
                'lon': '2.25651',
                'key': 'myvalue',
                'thisone': 'no',
                'one': 'two',
            }
        }
    }
    index_document(doc)
    index = DB.hgetall('d|xxxx')
    assert index[b'h|1bis'] == b'1 bis|48.325451|2.25651|myvalue|two'
Example #43
def try_fuzzy(helper, tokens, include_common=True):
    if not helper.bucket_dry or not tokens:
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    tokens.sort(key=lambda t: len(t), reverse=True)
    allkeys = helper.keys[:]
    if include_common:
        # As we are in fuzzy, try to narrow as much as possible by adding
        # unused common tokens.
        common = [t for t in helper.common if t.db_key not in helper.keys]
        allkeys.extend([t.db_key for t in common])
    for try_one in tokens:
        if helper.bucket_full:
            break
        keys = allkeys[:]
        if try_one.db_key in keys:
            keys.remove(try_one.db_key)
        if try_one.isdigit():
            continue
        helper.debug('Going fuzzy with %s', try_one)
        neighbors = make_fuzzy(try_one, max=helper.fuzzy)
        if len(keys):
            # Only retain tokens that have been seen in the index at least
            # once with the other tokens.
            DB.sadd(helper.query, *neighbors)
            interkeys = [pair_key(k[2:]) for k in keys]
            interkeys.append(helper.query)
            fuzzy_words = DB.sinter(interkeys)
            DB.delete(helper.query)
            # Keep the priority we gave in building fuzzy terms (inversion
            # first, then substitution, etc.).
            fuzzy_words = [w.decode() for w in fuzzy_words]
            fuzzy_words.sort(key=lambda x: neighbors.index(x))
        else:
            # The token we are considering is alone.
            fuzzy_words = []
            for neighbor in neighbors:
                key = dbkeys.token_key(neighbor)
                count = DB.zcard(key)
                if count:
                    fuzzy_words.append(neighbor)
        helper.debug('Found fuzzy candidates %s', fuzzy_words)
        fuzzy_keys = [dbkeys.token_key(w) for w in fuzzy_words]
        for key in fuzzy_keys:
            if helper.bucket_dry:
                helper.add_to_bucket(keys + [key])
Example #44
def pytest_runtest_teardown(item, nextitem):
    from addok.db import DB
    assert DB.connection_pool.connection_kwargs['db'] == 15
    DB.flushdb()
Example #45
def edge_ngram_deindexer(db, key, doc, tokens, **kwargs):
    if config.INDEX_EDGE_NGRAMS:
        for token in tokens:
            tkey = dbkeys.token_key(token)
            if not DB.exists(tkey):
                deindex_edge_ngrams(token)
Example #46
def test_index_document():
    index_document(DOC.copy())
    assert DB.exists('d|xxxx')
    assert DB.type('d|xxxx') == b'hash'
    assert DB.exists('w|rue')
    assert b'd|xxxx' in DB.zrange('w|rue', 0, -1)
    assert DB.exists('w|des')
    assert DB.exists('w|lilas')
    assert DB.exists('w|andresy')
    assert DB.exists('w|un')  # Housenumber.
    assert DB.exists('p|rue')
    assert DB.exists('p|des')
    assert DB.exists('p|lilas')
    assert DB.exists('p|andresy')
    assert b'lilas' in DB.smembers('p|andresy')
    assert b'andresy' in DB.smembers('p|lilas')
    assert DB.exists('p|un')
    assert DB.exists('g|u09dgm7')
    assert b'd|xxxx' in DB.smembers('g|u09dgm7')
    assert DB.exists('n|lil')
    assert DB.exists('n|lila')
    assert DB.exists('n|and')
    assert b'andresy' in DB.smembers('n|and')
    assert DB.exists('n|andr')
    assert b'andresy' in DB.smembers('n|andr')
    assert DB.exists('n|andre')
    assert b'andresy' in DB.smembers('n|andre')
    assert DB.exists('n|andres')
    assert b'andresy' in DB.smembers('n|andres')
    assert b'lilas' in DB.smembers('n|lil')
    assert DB.exists('f|type|street')
    assert b'd|xxxx' in DB.smembers('f|type|street')
    assert DB.exists('f|type|housenumber')
    assert b'd|xxxx' in DB.smembers('f|type|housenumber')
    assert len(DB.keys()) == 20
Example #47
def count_keys():
    """Helper method to return the number of keys in the test database."""
    try:
        return DB.info()['db15']['keys']
    except KeyError:
        return 0
Example #48
def test_deindex_document_should_deindex():
    index_document(DOC.copy())
    deindex_document(DOC['id'])
    assert not DB.exists('d|xxxx')
    assert not DB.exists('w|de')
    assert not DB.exists('w|lilas')
    assert not DB.exists('w|un')  # Housenumber.
    assert not DB.exists('p|rue')
    assert not DB.exists('p|des')
    assert not DB.exists('p|lilas')
    assert not DB.exists('p|un')
    assert not DB.exists('g|u09dgm7')
    assert not DB.exists('n|lil')
    assert not DB.exists('n|and')
    assert not DB.exists('n|andr')
    assert not DB.exists('n|andre')
    assert not DB.exists('n|andres')
    assert not DB.exists('f|type|street')
    assert len(DB.keys()) == 0
Example #49
def test_deindex_document_should_not_affect_other_docs():
    DOC2 = {
        'id': 'xxxx2',
        'type': 'street',
        'name': 'rue des Lilas',
        'city': 'Paris',
        'lat': '49.32545',
        'lon': '4.2565',
        'housenumbers': {
            '1': {
                'lat': '48.325451',  # Same geohash as DOC.
                'lon': '2.25651'
            }
        }
    }
    index_document(DOC.copy())
    index_document(DOC2)
    deindex_document(DOC['id'])
    assert not DB.exists('d|xxxx')
    assert b'd|xxxx' not in DB.zrange('w|rue', 0, -1)
    assert b'd|xxxx' not in DB.zrange('w|des', 0, -1)
    assert b'd|xxxx' not in DB.zrange('w|lilas', 0, -1)
    assert b'd|xxxx' not in DB.zrange('w|un', 0, -1)
    assert DB.exists('g|u09dgm7')
    assert b'd|xxxx' not in DB.smembers('g|u09dgm7')
    assert DB.exists('w|des')
    assert DB.exists('w|lilas')
    assert DB.exists('w|un')  # Housenumber.
    assert DB.exists('p|rue')
    assert b'd|xxxx2' in DB.zrange('w|rue', 0, -1)
    assert b'd|xxxx2' in DB.zrange('w|des', 0, -1)
    assert b'd|xxxx2' in DB.zrange('w|lilas', 0, -1)
    assert b'd|xxxx2' in DB.zrange('w|un', 0, -1)
    assert b'd|xxxx2' in DB.smembers('g|u09dgm7')
    assert b'd|xxxx2' in DB.smembers('g|u0g08g7')
    assert DB.exists('p|des')
    assert DB.exists('p|lilas')
    assert DB.exists('p|un')
    assert not DB.exists('n|and')
    assert not DB.exists('n|andr')
    assert not DB.exists('n|andre')
    assert not DB.exists('n|andres')
    assert DB.exists('n|par')
    assert DB.exists('n|pari')
    assert DB.exists('n|lil')
    assert DB.exists('n|lila')
    assert b'lilas' in DB.smembers('n|lil')
    assert b'lilas' in DB.smembers('n|lila')
    assert DB.exists('f|type|street')
    assert b'd|xxxx2' in DB.smembers('f|type|street')
    assert DB.exists('f|type|housenumber')
    assert b'd|xxxx2' in DB.smembers('f|type|housenumber')
    assert len(DB.keys()) == 19
Example #50
def token_key_frequency(key):
    return DB.zcard(key)
Example #51
def deindex_token(key, token):
    tkey = keys.token_key(token)
    DB.zrem(tkey, key)
Example #52
def deindex_geohash(key, lat, lon):
    lat = float(lat)
    lon = float(lon)
    geoh = geohash.encode(lat, lon, config.GEOHASH_PRECISION)
    geok = keys.geohash_key(geoh)
    DB.srem(geok, key)
Example #53
def search(self):
    if DB.exists(self.key):
        self.db_key = self.key