def deindex(db, key, doc, tokens, **kwargs):
    """Drop housenumber/token pair links for a document being removed.

    For each (housenumber, token) pair, keep the pair sets only if some
    other indexed document still contains both members.
    """
    for housenumber, data in doc.get('housenumbers', {}).items():
        for token in tokens:
            scratch = '|'.join(['didx', housenumber, token])
            # Any remaining document sharing both members of the pair?
            shared = db.zinterstore(
                scratch, [keys.token_key(housenumber), keys.token_key(token)])
            db.delete(scratch)
            if not shared:
                db.srem(pair_key(housenumber), token)
                db.srem(pair_key(token), housenumber)
def deindex(db, key, doc, tokens, **kwargs):
    """Drop pair links between the document's own tokens.

    Each unordered pair of distinct tokens is removed from the pair sets
    unless another indexed document still shares both tokens.
    """
    uniques = list(set(tokens))  # Unique values.
    for idx, left in enumerate(uniques):
        for right in uniques[idx:]:
            if left == right:
                continue
            scratch = '|'.join(['didx', left, right])
            # Do we have other documents that share left and right?
            remaining = db.zinterstore(
                scratch, [keys.token_key(left), keys.token_key(right)])
            db.delete(scratch)
            if not remaining:
                db.srem(pair_key(left), right)
                db.srem(pair_key(right), left)
def housenumbers_pairs_deindexer(db, key, doc, tokens, **kwargs):
    """Unlink housenumber/token pairs of a document being deindexed.

    Housenumbers are stored as hash fields named ``h|<number>`` whose
    value is a ``|``-separated payload.
    """
    for raw_field, raw_value in doc.items():
        name = raw_field.decode()
        if not name.startswith('h|'):
            continue
        # Payload layout: number|lat|lon[|extra...]. The parsed values are
        # not used here; unpacking also validates the payload shape.
        number, lat, lon, *extra = raw_value.decode().split('|')
        housenumber = name[2:]
        for token in tokens:
            scratch = '|'.join(['didx', housenumber, token])
            # Does any other document still share this pair?
            remaining = db.zinterstore(
                scratch, [keys.token_key(housenumber), keys.token_key(token)])
            db.delete(scratch)
            if not remaining:
                db.srem(pair_key(housenumber), token)
                db.srem(pair_key(token), housenumber)
def pairs_deindexer(db, key, doc, tokens, **kwargs):
    """Remove token pair links for a deindexed document.

    For every unique unordered pair of the document's tokens, check
    whether another indexed document still shares the pair; if not, drop
    each token from the other's pair set.

    :param db: Redis-like connection used for all commands.
    :param key: document key (unused here, kept for the deindexer API).
    :param doc: document payload (unused here).
    :param tokens: iterable of the document's tokens.
    """
    els = list(set(tokens))  # Unique values.
    # enumerate() replaces the original manual `loop` counter.
    for i, el in enumerate(els):
        for el2 in els[i:]:
            if el == el2:
                continue
            # Bug fix: the original assigned this scratch name to `key`,
            # shadowing the function parameter.
            tmp_key = '|'.join(['didx', el, el2])
            # Do we have other documents that share el and el2?
            commons = db.zinterstore(
                tmp_key, [keys.token_key(el), keys.token_key(el2)])
            db.delete(tmp_key)
            if not commons:
                db.srem(pair_key(el), el2)
                db.srem(pair_key(el2), el)
def deindex(db, key, doc, tokens, **kwargs):
    """Remove token pair links for a deindexed document.

    For every unique unordered pair of the document's tokens, check
    whether another indexed document still shares the pair; if not, drop
    each token from the other's pair set.

    :param db: Redis-like connection used for all commands.
    :param key: document key (unused here, kept for the deindexer API).
    :param doc: document payload (unused here).
    :param tokens: iterable of the document's tokens.
    """
    els = list(set(tokens))  # Unique values.
    # enumerate() replaces the original manual `loop` counter.
    for i, el in enumerate(els):
        for el2 in els[i:]:
            if el == el2:
                continue
            # Bug fix: the original assigned this scratch name to `key`,
            # shadowing the function parameter.
            tmp_key = '|'.join(['didx', el, el2])
            # Do we have other documents that share el and el2?
            commons = db.zinterstore(
                tmp_key, [keys.token_key(el), keys.token_key(el2)])
            db.delete(tmp_key)
            if not commons:
                db.srem(pair_key(el), el2)
                db.srem(pair_key(el2), el)
def housenumbers_pairs_deindexer(db, key, doc, tokens, **kwargs):
    """Deindex housenumber/token pairs stored on the document hash."""
    for field, value in doc.items():
        field = field.decode()
        if not field.startswith('h|'):
            continue
        # Unpacking validates the payload; the pieces are otherwise unused.
        number, lat, lon, *extra = value.decode().split('|')
        hn = field[2:]
        for token in tokens:
            tmp_key = '|'.join(['didx', hn, token])
            # Non-empty intersection means another document still links
            # this housenumber with this token.
            shared = db.zinterstore(
                tmp_key, [keys.token_key(hn), keys.token_key(token)])
            db.delete(tmp_key)
            if shared:
                continue
            db.srem(pair_key(hn), token)
            db.srem(pair_key(token), hn)
def try_fuzzy(helper, tokens, include_common=True):
    """Widen the search with fuzzy variants of *tokens*.

    For each candidate token (longest first), compute its fuzzy
    neighbors and try to extend the result bucket with index keys that
    match them, constrained by the tokens already in use.

    :param helper: search helper carrying bucket state and debug output.
    :param tokens: candidate tokens to fuzz; sorted in place.
    :param include_common: also intersect against common tokens that are
        not already part of ``helper.keys``.
    """
    if not helper.bucket_dry or not tokens:
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    # Longest first — presumably the most discriminating tokens; confirm.
    tokens.sort(key=lambda t: len(t), reverse=True)
    allkeys = helper.keys[:]
    if include_common:
        # As we are in fuzzy, try to narrow as much as possible by adding
        # unused common tokens.
        allkeys.extend(
            [t.db_key for t in helper.common if t.db_key not in helper.keys])
    for try_one in tokens:
        if helper.bucket_full:
            break
        keys = allkeys[:]
        if try_one.db_key in keys:
            keys.remove(try_one.db_key)
        # Pure-digit tokens are never fuzzed.
        if try_one.isdigit():
            continue
        helper.debug('Going fuzzy with %s and %s', try_one, keys)
        neighbors = make_fuzzy(try_one, max=helper.fuzzy)
        if len(keys):
            # Only retain tokens that have been seen in the index at least
            # once with the other tokens.
            # helper.pid is used as a temporary Redis set, deleted below.
            DB.sadd(helper.pid, *neighbors)
            interkeys = [pair_key(k[2:]) for k in keys]
            interkeys.append(helper.pid)
            fuzzy_words = DB.sinter(interkeys)
            DB.delete(helper.pid)
            # Keep the priority we gave in building fuzzy terms (inversion
            # first, then substitution, etc.).
            fuzzy_words = [w.decode() for w in fuzzy_words]
            fuzzy_words.sort(key=lambda x: neighbors.index(x))
        else:
            # The token we are considering is alone.
            fuzzy_words = []
            for neighbor in neighbors:
                key = dbkeys.token_key(neighbor)
                count = DB.zcard(key)
                if count:
                    fuzzy_words.append(neighbor)
        if fuzzy_words:
            helper.debug('Found fuzzy candidates %s', fuzzy_words)
            fuzzy_keys = [dbkeys.token_key(w) for w in fuzzy_words]
            for key in fuzzy_keys:
                # Re-check dryness: add_to_bucket may have filled it.
                if helper.bucket_dry:
                    helper.add_to_bucket(keys + [key])
def try_fuzzy(helper, tokens, include_common=True):
    """Widen the search with fuzzy variants of *tokens*.

    For each candidate token (longest first), compute its fuzzy
    neighbors and try to extend the result bucket with index keys that
    match them, constrained by the tokens already in use.

    :param helper: search helper carrying bucket state and debug output.
    :param tokens: candidate tokens to fuzz; sorted in place.
    :param include_common: also intersect against common tokens that are
        not already part of ``helper.keys``.
    """
    if not helper.bucket_dry or not tokens:
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    # Longest first — presumably the most discriminating tokens; confirm.
    tokens.sort(key=lambda t: len(t), reverse=True)
    allkeys = helper.keys[:]
    if include_common:
        # As we are in fuzzy, try to narrow as much as possible by adding
        # unused commons tokens.
        common = [t for t in helper.common if t.db_key not in helper.keys]
        allkeys.extend([t.db_key for t in common])
    for try_one in tokens:
        if helper.bucket_full:
            break
        keys = allkeys[:]
        if try_one.db_key in keys:
            keys.remove(try_one.db_key)
        # Pure-digit tokens are never fuzzed.
        if try_one.isdigit():
            continue
        helper.debug('Going fuzzy with %s', try_one)
        neighbors = make_fuzzy(try_one, max=helper.fuzzy)
        if len(keys):
            # Only retains tokens that have been seen in the index at least
            # once with the other tokens.
            # helper.query is used as a temporary Redis set, deleted below.
            DB.sadd(helper.query, *neighbors)
            interkeys = [pair_key(k[2:]) for k in keys]
            interkeys.append(helper.query)
            fuzzy_words = DB.sinter(interkeys)
            DB.delete(helper.query)
            # Keep the priority we gave in building fuzzy terms (inversion
            # first, then substitution, etc.).
            fuzzy_words = [w.decode() for w in fuzzy_words]
            fuzzy_words.sort(key=lambda x: neighbors.index(x))
        else:
            # The token we are considering is alone.
            fuzzy_words = []
            for neighbor in neighbors:
                key = dbkeys.token_key(neighbor)
                count = DB.zcard(key)
                if count:
                    fuzzy_words.append(neighbor)
        helper.debug('Found fuzzy candidates %s', fuzzy_words)
        fuzzy_keys = [dbkeys.token_key(w) for w in fuzzy_words]
        for key in fuzzy_keys:
            # Re-check dryness: add_to_bucket may have filled it.
            if helper.bucket_dry:
                helper.add_to_bucket(keys + [key])
def do_fuzzyindex(self, word):
    """Compute fuzzy extensions of word that exist in index.

    FUZZYINDEX lilas
    """
    word = list(preprocess_query(word))[0]
    candidates = make_fuzzy(Token(word))
    # Pair each candidate with its index frequency (zset cardinality).
    scored = [(c, DB.zcard(dbkeys.token_key(c))) for c in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    for candidate, freq in scored:
        if not freq:
            # Sorted descending: everything after this is unseen too.
            break
        print(white(candidate), blue(freq))
def do_fuzzyindex(self, word):
    """Compute fuzzy extensions of word that exist in index.

    FUZZYINDEX lilas
    """
    word = list(preprocess_query(word))[0]
    token = Token(word)
    token.make_fuzzy()
    # Pair each neighbor with its index frequency (zset cardinality),
    # most frequent first.
    scored = sorted(
        ((n, DB.zcard(dbkeys.token_key(n))) for n in token.neighbors),
        key=lambda item: item[1], reverse=True)
    for candidate, freq in scored:
        if freq == 0:
            # Sorted descending: the rest are unseen as well.
            break
        print(white(candidate), blue(freq))
def autocomplete(helper, tokens, skip_commons=False, use_geohash=False):
    """Try to extend the bucket by autocompleting the query's last token."""
    helper.debug('Autocompleting %s', helper.last_token)
    fixed_keys = [t.db_key for t in tokens if not t.is_last]
    fixed_pairs = [pair_key(t) for t in tokens if not t.is_last]
    # Candidates: tokens sharing an edge-ngram with the last token AND
    # paired with every other token of the query.
    candidates = DB.sinter(fixed_pairs + [edge_ngram_key(helper.last_token)])
    helper.debug('Found tokens to autocomplete %s', candidates)
    for raw in candidates:
        candidate_key = dbkeys.token_key(raw.decode())
        if skip_commons \
                and token_key_frequency(candidate_key) > config.COMMON_THRESHOLD:
            helper.debug('Skip common token to autocomplete %s', candidate_key)
            continue
        if not helper.bucket_overflow or helper.last_token in helper.not_found:
            helper.debug('Trying to extend bucket. Autocomplete %s',
                         candidate_key)
            extras = [candidate_key]
            if use_geohash and helper.geohash_key:
                extras.append(helper.geohash_key)
            helper.add_to_bucket(fixed_keys + extras)
def autocomplete(helper, tokens, skip_commons=False, use_geohash=False):
    """Try to extend the bucket by autocompleting the query's last token.

    Candidate tokens are those sharing an edge-ngram with the last token
    and paired (co-occurring in the index) with every other query token.

    :param helper: search helper carrying bucket state and debug output.
    :param tokens: query tokens; only the last one is autocompleted.
    :param skip_commons: ignore candidates more frequent than
        ``config.COMMON_THRESHOLD``.
    :param use_geohash: also intersect with the helper's geohash key.
    """
    helper.debug('Autocompleting %s', helper.last_token)
    # (Removed commented-out dead code that was left here.)
    keys = [t.db_key for t in tokens if not t.is_last]
    pair_keys = [pair_key(t) for t in tokens if not t.is_last]
    key = edge_ngram_key(helper.last_token)
    autocomplete_tokens = DB.sinter(pair_keys + [key])
    helper.debug('Found tokens to autocomplete %s', autocomplete_tokens)
    for token in autocomplete_tokens:
        key = dbkeys.token_key(token.decode())
        if skip_commons\
                and token_key_frequency(key) > config.COMMON_THRESHOLD:
            helper.debug('Skip common token to autocomplete %s', key)
            continue
        if not helper.bucket_overflow or helper.last_token in helper.not_found:
            helper.debug('Trying to extend bucket. Autocomplete %s', key)
            extra_keys = [key]
            if use_geohash and helper.geohash_key:
                extra_keys.append(helper.geohash_key)
            helper.add_to_bucket(keys + extra_keys)
def deindex(db, key, doc, tokens, **kwargs):
    """Clean up edge-ngrams of tokens that left the index entirely."""
    if not config.INDEX_EDGE_NGRAMS:
        return
    for token in tokens:
        # NOTE(review): reads the global DB although a `db` connection is
        # passed in — confirm this deindexer always targets the default DB.
        if not DB.exists(dbkeys.token_key(token)):
            deindex_edge_ngrams(token)
def key(self):
    # Lazily compute and cache the token's index key on first access.
    try:
        return self._key
    except AttributeError:
        self._key = keys.token_key(self)
        return self._key
def edge_ngram_deindexer(db, key, doc, tokens, **kwargs):
    """Drop edge-ngrams for tokens no longer present in the index."""
    if config.INDEX_EDGE_NGRAMS:
        # NOTE(review): checks the global DB although `db` is passed in —
        # confirm this deindexer always targets the default connection.
        gone = (t for t in tokens if not DB.exists(dbkeys.token_key(t)))
        for token in gone:
            deindex_edge_ngrams(token)