def test_make_fuzzy_should_remove_letter_if_world_is_long():
    """Check that single-letter deletions are generated for 'train' but not 'mot'.

    For the three-letter word 'mot' the deletion 'mt' must be absent, while
    every one-letter deletion of the five-letter word 'train' must be present.
    """
    assert 'mt' not in make_fuzzy('mot')
    # One entry per dropped position of 'train' (t, r, a, i, n in turn).
    for shortened in ('rain', 'tain', 'trin', 'tran', 'trai'):
        assert shortened in make_fuzzy('train')
def try_fuzzy(helper, tokens, include_common=True):
    """Try to feed the result bucket by fuzzing the tokens one at a time.

    Nothing happens unless ``helper.bucket_dry`` is truthy and ``tokens`` is
    non-empty. ``tokens`` is sorted in place, longest first, and each
    non-digit token is then replaced in turn by the variants returned by
    ``make_fuzzy`` while the remaining keys are kept as they are. The loop
    stops as soon as ``helper.bucket_full`` is truthy.

    :param helper: search helper; this function reads ``bucket_dry``,
        ``bucket_full``, ``keys``, ``common``, ``fuzzy`` and ``query`` and
        calls ``debug`` and ``add_to_bucket`` on it.
    :param tokens: list of tokens to fuzz (reordered in place).
    :param include_common: when true, the ``db_key`` of every token in
        ``helper.common`` that is not already in ``helper.keys`` is added to
        the keys used to narrow the search.
    :return: None.
    """
    if not (helper.bucket_dry and tokens):
        return
    helper.debug('Fuzzy on. Trying with %s.', tokens)
    # Longest tokens are tried first.
    tokens.sort(key=len, reverse=True)

    base_keys = helper.keys[:]
    if include_common:
        # We are in fuzzy mode: narrow the search as much as possible by
        # also using the common tokens that are not part of the keys yet.
        unused_common = [
            t for t in helper.common if t.db_key not in helper.keys
        ]
        base_keys.extend([t.db_key for t in unused_common])

    for token in tokens:
        if helper.bucket_full:
            break
        # Every key except the one of the token being fuzzed.
        other_keys = base_keys[:]
        if token.db_key in other_keys:
            other_keys.remove(token.db_key)
        if token.isdigit():
            continue
        helper.debug('Going fuzzy with %s', token)
        neighbors = make_fuzzy(token, max=helper.fuzzy)

        if other_keys:
            # Only keep the variants that have been seen in the index at
            # least once together with the other tokens: store the variants
            # under a temporary key, intersect, then drop the temporary key.
            DB.sadd(helper.query, *neighbors)
            # NOTE(review): k[2:] presumably strips a two-character key
            # prefix before building the pair key -- confirm against dbkeys.
            to_intersect = (
                [pair_key(k[2:]) for k in other_keys] + [helper.query]
            )
            matched = DB.sinter(to_intersect)
            DB.delete(helper.query)
            # Restore the priority order in which make_fuzzy produced the
            # variants (inversion first, then substitution, etc.).
            candidates = sorted(
                (raw.decode() for raw in matched),
                key=neighbors.index,
            )
        else:
            # The token is alone: keep the variants whose token key has a
            # non-zero cardinality in the index.
            candidates = [
                variant for variant in neighbors
                if DB.zcard(dbkeys.token_key(variant))
            ]

        helper.debug('Found fuzzy candidates %s', candidates)
        candidate_keys = [dbkeys.token_key(word) for word in candidates]
        for candidate_key in candidate_keys:
            # Re-check before every attempt: an earlier candidate may have
            # fed the bucket already.
            if helper.bucket_dry:
                helper.add_to_bucket(other_keys + [candidate_key])
def test_make_fuzzy_should_extend_term():
    """Check the exact set of variants produced by make_fuzzy for 'mot'.

    The expected set is made of the two adjacent-letter swaps, every
    single-letter insertion and every single-letter substitution over the
    lowercase ASCII alphabet, with the word itself excluded.
    """
    word = 'mot'
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Swaps of two adjacent letters.
    transpositions = {'omt', 'mto'}
    # One extra letter at every position, both ends included.
    insertions = {
        word[:position] + letter + word[position:]
        for letter in alphabet
        for position in range(len(word) + 1)
    }
    # One letter replaced at every position.
    substitutions = {
        word[:position] + letter + word[position + 1:]
        for letter in alphabet
        for position in range(len(word))
    }
    # Replacing a letter by itself gives back the word, which is not expected.
    expected = (transpositions | insertions | substitutions) - {word}
    assert set(make_fuzzy('mot')) == expected