コード例 #1
0
def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True):
    '''
    Normalize a string, tokenize the result, and normalize every token.

    Only libpostal's deterministic normalizations (those producing a
    single output) are applied here. The string tree version is the one
    that yields several normalized strings, each with its own tokens.

    @param s: the input string (passed through safe_decode first)
    @param string_options: bit flags for string-level normalization; when
                           the NORMALIZE_STRING_LATIN_ASCII bit is set the
                           latin normalizer is used, otherwise the UTF-8 one
    @param token_options: options passed to the per-token normalizer
    @param strip_parentheticals: if true, the token list is passed through
                                 remove_parens before being returned
    @return: a list of (normalized token, token type) tuples

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    text = safe_decode(s)

    # Choose the string-level normalizer from the option flags.
    normalize_string = (
        _normalize.normalize_string_latin
        if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII
        else _normalize.normalize_string_utf8
    )
    normalized = normalize_string(text, string_options)

    tokens = []
    # Each raw token is a tuple of (offset, len, type).
    for raw_token in tokenize_raw(normalized):
        token_text = _normalize.normalize_token(normalized, raw_token,
                                                token_options)
        # The last element of the raw tuple is the token type id.
        tokens.append((token_text, token_types.from_id(raw_token[-1])))

    return remove_parens(tokens) if strip_parentheticals else tokens
コード例 #2
0
ファイル: parser.py プロジェクト: nvkelso/libpostal
def parse_address(address, language=None, country=None):
    '''
    Decode the address and hand it to the underlying _parser module.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    @return: the result of _parser.parse_address for the decoded address
    '''
    decoded_address = safe_decode(address, 'utf-8')
    parsed = _parser.parse_address(decoded_address,
                                   language=language,
                                   country=country)
    return parsed
コード例 #3
0
ファイル: parser.py プロジェクト: pombredanne/libpostal
def parse_address(address, language=None, country=None):
    '''
    Run the address parser on a decoded copy of the given address.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    @return: the result of _parser.parse_address for the decoded address
    '''
    parser_kwargs = {'language': language, 'country': country}
    return _parser.parse_address(safe_decode(address, 'utf-8'),
                                 **parser_kwargs)
コード例 #4
0
ファイル: expand.py プロジェクト: nvkelso/libpostal
def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Decode the address and pass it to the underlying _expand module.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
                      (e.g. "en", "fr", "de", etc.) to use in expansion.
                      Default is English. Until automatic language
                      classification is ready in libpostal, this parameter
                      is required.
    @param kw: any further keyword arguments, forwarded unchanged
    @return: the result of _expand.expand_address for the decoded address
    '''
    decoded_address = safe_decode(address, 'utf-8')
    expansions = _expand.expand_address(decoded_address,
                                        languages=languages, **kw)
    return expansions
コード例 #5
0
ファイル: expand.py プロジェクト: pombredanne/libpostal
def expand_address(address, languages=DEFAULT_LANGUAGES, **kw):
    '''
    Expand an address via _expand.expand_address after decoding it.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param languages: a tuple or list of ISO language code strings
                      (e.g. "en", "fr", "de", etc.) to use in expansion.
                      Default is English. Until automatic language
                      classification is ready in libpostal, this parameter
                      is required.
    @param kw: any further keyword arguments, forwarded unchanged
    @return: the result of _expand.expand_address for the decoded address
    '''
    return _expand.expand_address(
        safe_decode(address, 'utf-8'),
        languages=languages,
        **kw
    )
コード例 #6
0
ファイル: tokenize.py プロジェクト: nvkelso/libpostal
def tokenize(s):
    '''
    Tokenize a string into (token text, token type) tuples.

    The decoded form of the input is passed to the tokenizer, while the
    token text is sliced out of the encoded form using the (start, length)
    pairs the tokenizer reports, and then decoded again.
    NOTE(review): this implies the reported offsets index the encoded
    string -- confirm against _tokenize.

    @param s: the string to tokenize
    @return: a list of (token text, token type) tuples
    '''
    decoded = safe_decode(s)
    encoded = safe_encode(s)

    result = []
    for offset, size, type_id in _tokenize.tokenize(decoded):
        piece = encoded[offset:offset + size]
        result.append((safe_decode(piece), token_types.from_id(type_id)))
    return result
コード例 #7
0
ファイル: tokenize.py プロジェクト: nvkelso/libpostal
def tokenize_raw(s):
    '''
    Return the tokenizer's output for the string without post-processing.

    @param s: the string to tokenize (passed through safe_decode first)
    @return: whatever _tokenize.tokenize returns for the decoded string
    '''
    decoded = safe_decode(s)
    return _tokenize.tokenize(decoded)
コード例 #8
0
ファイル: tokenize.py プロジェクト: pombredanne/libpostal
def tokenize(s):
    '''
    Split a string into a list of (token text, token type) tuples.

    The tokenizer is given the decoded input; each token's text is cut
    from the encoded input with the reported start and length, then
    decoded.
    NOTE(review): presumably the tokenizer's offsets refer to the encoded
    form -- confirm against _tokenize.

    @param s: the string to tokenize
    @return: a list of (token text, token type) tuples
    '''
    unicode_text = safe_decode(s)
    encoded_text = safe_encode(s)

    def build_pair(start, length, token_type):
        # Slice the encoded text, then decode just that token.
        token_text = safe_decode(encoded_text[start:start + length])
        return (token_text, token_types.from_id(token_type))

    return [build_pair(start, length, token_type)
            for start, length, token_type in _tokenize.tokenize(unicode_text)]
コード例 #9
0
ファイル: tokenize.py プロジェクト: pombredanne/libpostal
def tokenize_raw(s):
    '''
    Decode the input and return the tokenizer's result unchanged.

    @param s: the string to tokenize (passed through safe_decode first)
    @return: whatever _tokenize.tokenize returns for the decoded string
    '''
    text = safe_decode(s)
    raw_tokens = _tokenize.tokenize(text)
    return raw_tokens