예제 #1
0
# Regex alternation built from the entries of TYPES; for each entry the first
# character is accepted either as written or upper-cased.
TYPES_REGEX = '|'.join(
    '[{}{}]{}'.format(kind[0], kind[0].upper(), kind[1:]) for kind in TYPES
)


def _clean_query(q):
    """Strip postal-delivery noise from a free-text address query.

    Case-insensitively removes "cedex"/"cédex", "BP" and "CS" markers
    together with the digits that follow them, and floor mentions such as
    "3e étage" or "2ème etage". Runs of spaces are then collapsed, the
    abbreviations "s/" and "s/s" (surrounded by spaces or hyphens) are
    expanded to "sur" and "sous", and the result is stripped.

    "BP" and "CS" are only removed when they start a token, so words that
    merely contain those letters (e.g. "Francs", "Lacs") are left intact.
    """
    ignore_case = re.IGNORECASE
    q = re.sub(r'c(e|é)dex ?\d*', '', q, flags=ignore_case)
    # Leading \b: without it "cs" would also match the tail of "Francs".
    q = re.sub(r'\bbp ?\d*', '', q, flags=ignore_case)
    q = re.sub(r'\bcs ?\d*', '', q, flags=ignore_case)
    q = re.sub(r'\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=ignore_case)
    # The removals above can leave several consecutive spaces behind.
    q = re.sub(r' {2,}', ' ', q)
    q = re.sub(r'[ -]s/[ -]', ' sur ', q, flags=ignore_case)
    q = re.sub(r'[ -]s/s[ -]', ' sous ', q, flags=ignore_case)
    return q.strip()
# Public counterpart of _clean_query, built with the project's yielder helper.
# NOTE(review): presumably yielder applies the helper to each item of a
# pipeline — confirm against its definition.
clean_query = yielder(_clean_query)


def _extract_address(q):
    """Return the part of ``q`` matched by ``extract_address_pattern``.

    When the pattern is not found anywhere in ``q``, ``q`` is returned
    unchanged.
    """
    found = extract_address_pattern.search(q)
    if found is None:
        return q
    return found.group()
# Matches a house number (optionally followed by "bis"/"ter" and a comma), one
# of the street types from TYPES_REGEX, and the rest of the line after it.
# Raw literals keep "\d" a regex escape instead of an invalid string escape.
extract_address_pattern = re.compile(
    r'(\d+( *(bis|ter))?,? +(' + TYPES_REGEX + r') .*(\d{5})?).*',
    flags=re.IGNORECASE)
# Public counterpart of _extract_address, built with the project's yielder
# helper (presumably applied to each item of a pipeline — confirm).
extract_address = yielder(_extract_address)


def _glue_ordinal(q):
    """Glue '3' and 'bis'.

    Every match of ``glue_ordinal_pattern`` in ``q`` is replaced by its first
    three capture groups written back to back; anything else the pattern
    matched is dropped. The rewritten string is returned.
    """
    # Raw literal: "\g<1>" is a regex template reference, not a string escape.
    return glue_ordinal_pattern.sub(r'\g<1>\g<2>\g<3>', q)
# Regex alternation: the Latin multiplicatives "bis" through "sexies", or any
# single lowercase letter.
# NOTE(review): presumably the suffixes allowed after a house number when
# building glue_ordinal_pattern — confirm where that pattern is defined.
ORDINAL_REGEX = 'bis|ter|quater|quinquies|sexies|[a-z]'
예제 #2
0
            ("ck", "k"),
            ("ph", "f"),
            ("th$", "te"),  # This t sounds.
            ("(?<=[^sc0-9])h", ""),
            ("^h(?=.)+", ""),
            ("sc", "s"),
            ("sh", "ch"),
            ("((?<=[^0-9])w|^w)", "v"),
            ("c(?=[eiy])", "s"),
            ("(?<=[^0-9])y", "i"),
            ("esn", "en"),
            ("oe(?=\\w)", "e"),
            ("(?<=[^0-9])s$", ""),
            ("(?<=u)l?x$", ""),  # eaux, eux, aux, aulx
            ("(?<=u)lt$", "t"),
            ("(?<=[a-z])[dg]$", ""),
            ("(?<=[^es0-9])t$", ""),
            ("(?<=[aeiou])(m)(?=[pbgf])", "n"),
            ("(?<=[a-z]{2})(e$)", ""),  # Remove "e" at last position only if
            # it follows two letters?
            ("(\\D)(?=\\1)", ""),  # Remove duplicate letters.
        )
        _s = s
        for pattern, repl in rules:
            _s = re.sub(pattern, repl, _s)
        _CACHE[s] = _s
    return _CACHE[s]


# Public counterpart of _phonemicize, built with the project's yielder helper.
# NOTE(review): presumably yielder applies the helper to each item of a
# pipeline — confirm against its definition.
phonemicize = yielder(_phonemicize)
예제 #3
0
    map(lambda x: '[{}{}]{}'.format(x[0], x[0].upper(), x[1:]), TYPES))


def _clean_query(q):
    """Strip postal-delivery noise from a free-text address query.

    Case-insensitively removes "cedex"/"cédex", "BP" and "CS" markers
    together with the digits that follow them, and floor mentions such as
    "3e étage" or "2ème etage". Runs of spaces are then collapsed, the
    abbreviations "s/" and "s/s" (surrounded by spaces or hyphens) are
    expanded to "sur" and "sous", and the result is stripped.

    "BP" and "CS" are only removed when they start a token, so words that
    merely contain those letters (e.g. "Francs", "Lacs") are left intact.
    """
    ignore_case = re.IGNORECASE
    q = re.sub(r'c(e|é)dex ?\d*', '', q, flags=ignore_case)
    # Leading \b: without it "cs" would also match the tail of "Francs".
    q = re.sub(r'\bbp ?\d*', '', q, flags=ignore_case)
    q = re.sub(r'\bcs ?\d*', '', q, flags=ignore_case)
    q = re.sub(r'\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=ignore_case)
    # The removals above can leave several consecutive spaces behind.
    q = re.sub(r' {2,}', ' ', q)
    q = re.sub(r'[ -]s/[ -]', ' sur ', q, flags=ignore_case)
    q = re.sub(r'[ -]s/s[ -]', ' sous ', q, flags=ignore_case)
    return q.strip()


# Public counterpart of _clean_query, built with the project's yielder helper.
# NOTE(review): presumably yielder applies the helper to each item of a
# pipeline — confirm against its definition.
clean_query = yielder(_clean_query)


def _extract_address(q):
    """Return the part of ``q`` matched by ``extract_address_pattern``.

    When the pattern is not found anywhere in ``q``, ``q`` is returned
    unchanged.
    """
    found = extract_address_pattern.search(q)
    if found is None:
        return q
    return found.group()


# Matches a house number (optionally followed by "bis"/"ter" and a comma), one
# of the street types from TYPES_REGEX, and the rest of the line after it.
# Raw literals keep "\d" a regex escape instead of an invalid string escape.
extract_address_pattern = re.compile(
    r'(\d+( *(bis|ter))?,? +(' + TYPES_REGEX + r') .*(\d{5})?).*',
    flags=re.IGNORECASE)
# Public counterpart of _extract_address, built with the project's yielder
# helper (presumably applied to each item of a pipeline — confirm).
extract_address = yielder(_extract_address)


def _glue_ordinal(q):
    """Glue '3' and 'bis'."""
예제 #4
0
from addok.utils import yielder

from . import normalize as _normalize
from . import synonymize as _synonymize
from . import tokenize as _tokenize


def tokenize(pipe):
    """Yield, in order, every token ``_tokenize`` produces for each item of ``pipe``."""
    for entry in pipe:
        yield from _tokenize(entry)


# Public counterparts of the package-level normalize and synonymize helpers,
# built with yielder from addok.utils.
# NOTE(review): presumably yielder applies the helper to each item of a
# pipeline, as tokenize does by hand — confirm in addok.utils.
normalize = yielder(_normalize)
synonymize = yielder(_synonymize)
예제 #5
0
파일: fr.py 프로젝트: REIMSMetropole/addok
            ("cc(?=[ie])", "s"),  # Others will hit the c => k and deduplicate
            ("ck", "k"),
            ("ph", "f"),
            ("th$", "te"),  # This t sounds.
            ("(?<=[^sc0-9])h", ""),
            ("^h(?=.)+", ""),
            ("sc", "s"),
            ("sh", "ch"),
            ("((?<=[^0-9])w|^w)", "v"),
            ("c(?=[eiy])", "s"),
            ("(?<=[^0-9])y", "i"),
            ("esn", "en"),
            ("oe(?=\\w)", "e"),
            ("(?<=[^0-9])s$", ""),
            ("(?<=u)l?x$", ""),  # eaux, eux, aux, aulx
            ("(?<=u)lt$", "t"),
            ("(?<=[a-z])[dg]$", ""),
            ("(?<=[^es0-9])t$", ""),
            ("(?<=[aeiou])(m)(?=[pbgf])", "n"),
            ("(?<=[a-z]{2})(e$)", ""),  # Remove "e" at last position only if
                                        # it follows two letters?
            ("(\\D)(?=\\1)", ""),  # Remove duplicate letters.
        )
        _s = s
        for pattern, repl in rules:
            _s = re.sub(pattern, repl, _s)
        _CACHE[s] = _s
    return _CACHE[s]

# Public counterpart of _phonemicize, built with the project's yielder helper.
# NOTE(review): presumably yielder applies the helper to each item of a
# pipeline — confirm against its definition.
phonemicize = yielder(_phonemicize)