Exemple #1
0
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True):
    '''
    Normalize a string, tokenize the result, then normalize every token.

    Only libpostal's deterministic (single-output) normalizations are
    applied here. The string tree variant is the one that yields several
    normalized strings, each with its own tokens.

    @param s: the input string (passed through safe_decode first)
    @param string_options: bit flags for string-level normalization; the
                           NORMALIZE_STRING_LATIN_ASCII bit selects the
                           Latin normalizer instead of the UTF-8 one
    @param token_options: options handed to the per-token normalizer
    @param strip_parentheticals: when true, the token list is passed
                                 through remove_parens before returning

    @return: a list of (normalized token, token type) pairs, or the
             result of remove_parens on that list

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    text = safe_decode(s)

    # Choose the string-level normalizer from the Latin-ASCII flag.
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalize_string = _normalize.normalize_string_latin
    else:
        normalize_string = _normalize.normalize_string_utf8
    normalized = normalize_string(text, string_options)

    tokens = []
    # Each raw token is a tuple of (offset, len, type).
    for raw in tokenize_raw(normalized):
        token = _normalize.normalize_token(normalized, raw, token_options)
        tokens.append((token, token_types.from_id(raw[-1])))

    return remove_parens(tokens) if strip_parentheticals else tokens
Exemple #2
0
def parse_address(address, language=None, country=None):
    '''
    Decode the address and hand it to the underlying parser.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code, forwarded to the parser
    @param country (optional): country code, forwarded to the parser

    @return: whatever _parser.parse_address returns for the decoded address
    '''
    decoded = safe_decode(address, 'utf-8')
    hints = {'language': language, 'country': country}
    return _parser.parse_address(decoded, **hints)
Exemple #3
0
def parse_address(address, language=None, country=None):
    '''
    Parse an address by delegating to _parser.parse_address.

    The address is first run through safe_decode with 'utf-8'; the
    language and country hints are passed along as keyword arguments.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code

    @return: the value produced by _parser.parse_address
    '''
    unicode_address = safe_decode(address, 'utf-8')
    parsed = _parser.parse_address(
        unicode_address,
        language=language,
        country=country
    )
    return parsed
Exemple #4
0
def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Decode the address and forward it to _expand.expand_address.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
                      (e.g. "en", "fr", "de", etc.) to use in expansion.
                      Falls back to DEFAULT_LANGUAGES when not supplied.
    @param kw: any further keyword options, passed through unchanged

    @return: whatever _expand.expand_address returns
    '''
    decoded = safe_decode(address, 'utf-8')
    # 'languages' is a named parameter, so it can never also appear in kw.
    options = dict(kw, languages=languages)
    return _expand.expand_address(decoded, **options)
Exemple #5
0
def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Expand an address via the underlying _expand.expand_address call.

    The address goes through safe_decode with 'utf-8' before being
    handed over together with the language list and extra options.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
                      (e.g. "en", "fr", "de", etc.) to use in expansion;
                      DEFAULT_LANGUAGES is used when omitted
    @param kw: extra keyword arguments forwarded as-is

    @return: the result of _expand.expand_address
    '''
    unicode_address = safe_decode(address, 'utf-8')
    expansions = _expand.expand_address(unicode_address,
                                        languages=languages,
                                        **kw)
    return expansions
Exemple #6
0
def tokenize(s):
    '''
    Tokenize a string into (token text, token type) pairs.

    The tokenizer runs on the safe_decode'd input and yields
    (start, length, type id) triples. Each token's text is sliced out of
    the safe_encode'd input and decoded again, so the offsets index into
    the encoded form (presumably UTF-8 bytes -- confirm against
    safe_encode).

    @param s: the string to tokenize
    @return: a list of (decoded token, token type) tuples
    '''
    text = safe_decode(s)
    encoded = safe_encode(s)

    result = []
    for offset, size, type_id in _tokenize.tokenize(text):
        piece = safe_decode(encoded[offset:offset + size])
        result.append((piece, token_types.from_id(type_id)))
    return result
Exemple #7
0
def tokenize_raw(s):
    '''
    Run the low-level tokenizer on the decoded input and return its
    output untouched.

    @param s: the string to tokenize (passed through safe_decode)
    @return: whatever _tokenize.tokenize returns for the decoded string
    '''
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
Exemple #8
0
def tokenize(s):
    '''
    Split a string into tokens, pairing each with its token type.

    Spans come from _tokenize.tokenize on the decoded input as
    (start, length, type id) triples; the token text is cut from the
    encoded input at those offsets and then decoded.

    @param s: the string to tokenize
    @return: a list of (decoded token, token type) tuples
    '''
    decoded_input = safe_decode(s)
    encoded_input = safe_encode(s)

    pairs = []
    for begin, span, kind in _tokenize.tokenize(decoded_input):
        end = begin + span
        surface = safe_decode(encoded_input[begin:end])
        pairs.append((surface, token_types.from_id(kind)))
    return pairs
Exemple #9
0
def tokenize_raw(s):
    '''
    Tokenize without any post-processing.

    @param s: the string to tokenize; it is decoded with safe_decode first
    @return: the direct result of _tokenize.tokenize
    '''
    text = safe_decode(s)
    raw_tokens = _tokenize.tokenize(text)
    return raw_tokens