TYPES_REGEX = '|'.join(
    map(lambda x: '[{}{}]{}'.format(x[0], x[0].upper(), x[1:]), TYPES)
)


def _clean_query(q):
    q = re.sub(r'c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE)
    q = re.sub(r'bp ?[\d]*', '', q, flags=re.IGNORECASE)
    q = re.sub(r'cs ?[\d]*', '', q, flags=re.IGNORECASE)
    q = re.sub(r'\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE)
    q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE)
    q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE)
    q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE)
    q = q.strip()
    return q
clean_query = yielder(_clean_query)


def _extract_address(q):
    m = extract_address_pattern.search(q)
    return m.group() if m else q
extract_address_pattern = re.compile(
    r'(\d+( *(bis|ter))?,? +(' + TYPES_REGEX + r') .*(\d{5})?).*',
    flags=re.IGNORECASE)
extract_address = yielder(_extract_address)


def _glue_ordinal(q):
    """Glue '3' and 'bis'."""
    return glue_ordinal_pattern.sub(r'\g<1>\g<2>\g<3>', q)
ORDINAL_REGEX = 'bis|ter|quater|quinquies|sexies|[a-z]'
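
# Illustrative usage (not part of the original module; assumes "rue" is one of
# the way types listed in TYPES, which is defined elsewhere in this file):
#
#     _clean_query('3 rue des Fleurs CEDEX 09')
#     # -> '3 rue des Fleurs'
#     _extract_address('Mairie, 3 bis rue des Fleurs 75010 Paris')
#     # -> '3 bis rue des Fleurs 75010 Paris'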
("ck", "k"), ("ph", "f"), ("th$", "te"), # This t sounds. ("(?<=[^sc0-9])h", ""), ("^h(?=.)+", ""), ("sc", "s"), ("sh", "ch"), ("((?<=[^0-9])w|^w)", "v"), ("c(?=[eiy])", "s"), ("(?<=[^0-9])y", "i"), ("esn", "en"), ("oe(?=\\w)", "e"), ("(?<=[^0-9])s$", ""), ("(?<=u)l?x$", ""), # eaux, eux, aux, aulx ("(?<=u)lt$", "t"), ("(?<=[a-z])[dg]$", ""), ("(?<=[^es0-9])t$", ""), ("(?<=[aeiou])(m)(?=[pbgf])", "n"), ("(?<=[a-z]{2})(e$)", ""), # Remove "e" at last position only if # it follows two letters? ("(\\D)(?=\\1)", ""), # Remove duplicate letters. ) _s = s for pattern, repl in rules: _s = re.sub(pattern, repl, _s) _CACHE[s] = _s return _CACHE[s] phonemicize = yielder(_phonemicize)
from addok.utils import yielder

from . import normalize as _normalize
from . import synonymize as _synonymize
from . import tokenize as _tokenize


def tokenize(pipe):
    for text in pipe:
        for token in _tokenize(text):
            yield token
normalize = yielder(_normalize)
synonymize = yielder(_synonymize)
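
# Illustrative sketch (not part of the original module): each helper above is a
# generator-based pipeline stage, so stages compose by wrapping an iterable of
# strings; the input string here is only an example.
#
#     pipe = iter(['Rue des Lilas'])
#     for token in synonymize(normalize(tokenize(pipe))):
#         print(token)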
("cc(?=[ie])", "s"), # Others will hit the c => k and deduplicate ("ck", "k"), ("ph", "f"), ("th$", "te"), # This t sounds. ("(?<=[^sc0-9])h", ""), ("^h(?=.)+", ""), ("sc", "s"), ("sh", "ch"), ("((?<=[^0-9])w|^w)", "v"), ("c(?=[eiy])", "s"), ("(?<=[^0-9])y", "i"), ("esn", "en"), ("oe(?=\\w)", "e"), ("(?<=[^0-9])s$", ""), ("(?<=u)l?x$", ""), # eaux, eux, aux, aulx ("(?<=u)lt$", "t"), ("(?<=[a-z])[dg]$", ""), ("(?<=[^es0-9])t$", ""), ("(?<=[aeiou])(m)(?=[pbgf])", "n"), ("(?<=[a-z]{2})(e$)", ""), # Remove "e" at last position only if # it follows two letters? ("(\\D)(?=\\1)", ""), # Remove duplicate letters. ) _s = s for pattern, repl in rules: _s = re.sub(pattern, repl, _s) _CACHE[s] = _s return _CACHE[s] phonemicize = yielder(_phonemicize)