def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True, whitespace=False,
                      languages=None):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations,
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    normalized_tokens = _normalize.normalized_tokens(
        s, string_options, token_options, whitespace, languages=languages)

    if strip_parentheticals:
        normalized_tokens = remove_parens(normalized_tokens)

    return [(s, token_types.from_id(token_type))
            for s, token_type in normalized_tokens]
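
# A usage sketch for normalized_tokens. It returns a list of
# (token, token_type) tuples; the exact tokens and type constants depend on
# the installed libpostal data and the postal.tokens module, so the output
# below is illustrative of the shape, not a guaranteed result.
#
#   >>> normalized_tokens(u'St.-Barthélemy')
#   [(u'saint', token_types.WORD), (u'barthelemy', token_types.WORD)]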
def parse_address(address, language=None, country=None):
    """
    Parse address into components.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    """
    address = safe_decode(address, 'utf-8')
    return _parser.parse_address(address, language=language, country=country)
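
# A usage sketch for parse_address. It returns a list of (component, label)
# tuples; segmentation comes from the trained libpostal model, so the output
# below is illustrative only.
#
#   >>> parse_address(u'781 Franklin Ave Crown Heights Brooklyn NY 11216 USA')
#   [(u'781', u'house_number'), (u'franklin ave', u'road'),
#    (u'crown heights', u'suburb'), (u'brooklyn', u'city_district'),
#    (u'ny', u'state'), (u'11216', u'postcode'), (u'usa', u'country')]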
def expand_address(address, languages=None, **kw):
    """
    Expand the given address into one or more normalized strings.

    Required
    --------
    @param address: the address as either Unicode or a UTF-8 encoded string

    Options
    -------
    @param languages: a tuple or list of ISO language code strings
        (e.g. "en", "fr", "de", etc.) to use in expansion. If None is
        passed, use the language classifier to detect the language
        automatically.
    @param address_components: an integer (bit-set) of address component
        expansions to use e.g. ADDRESS_NAME | ADDRESS_STREET would use
        only expansions which apply to venue names or streets.
    @param latin_ascii: use the Latin to ASCII transliterator, which
        normalizes e.g. æ => ae
    @param transliterate: use any available transliterators for non-Latin
        scripts, e.g. the Greek phrase διαφορετικούς becomes diaphoretikoús̱
    @param strip_accents: strip accented characters e.g. é => e, ç => c.
        This loses some information in various languages, but in general
        we want accented and unaccented spellings of the same address
        to produce the same expansion.
    @param decompose: perform Unicode normalization (NFD form)
    @param lowercase: UTF-8 lowercase the string
    @param trim_string: trim spaces on either side of the string
    @param replace_word_hyphens: add a version of the string with hyphens
        replaced by spaces
    @param delete_word_hyphens: add a version of the string with hyphens
        deleted
    @param replace_numeric_hyphens: add a version of the string with
        numeric hyphens replaced e.g. 12345-6789 => 12345 6789
    @param delete_numeric_hyphens: add a version of the string with
        numeric hyphens removed e.g. 12345-6789 => 123456789
    @param split_alpha_from_numeric: split tokens like CR17 into CR 17,
        which helps with expansion of certain types of highway
        abbreviations
    @param delete_final_periods: remove final periods on abbreviations
        e.g. St. => St
    @param delete_acronym_periods: remove periods in acronyms
        e.g. U.S.A. => USA
    @param drop_english_possessives: normalize possessives
        e.g. Mark's => Marks
    @param delete_apostrophes: delete apostrophes e.g. O'Malley => OMalley
    @param expand_numex: convert numeric expressions e.g. Twenty sixth =>
        26th, using either the supplied languages or the result of
        automatic language classification.
    @param roman_numerals: normalize Roman numerals e.g. IX => 9. Since
        these can be ambiguous (especially I and V), turning this on
        simply adds another version of the string if any potential Roman
        numerals are found.
    """
    address = safe_decode(address, 'utf-8')
    return _expand.expand_address(address, languages=languages, **kw)
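
# A usage sketch for expand_address. Expansions come back as a list of
# normalized strings; the exact set depends on the installed libpostal data,
# so the output below is illustrative and may include further variants.
#
#   >>> expand_address(u'Quatre vingt douze Ave des Champs-Élysées')
#   [u'92 avenue des champs-elysees', u'92 avenue des champs elysees', ...]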
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS, languages=None):
    """Normalize a string with the given string-level options."""
    s = safe_decode(s)
    return _normalize.normalize_string(s, string_options, languages=languages)
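
# A usage sketch for normalize_string, assuming DEFAULT_STRING_OPTIONS
# includes lowercasing and accent stripping; the output is illustrative and
# depends on the option flags actually set.
#
#   >>> normalize_string(u'Barthélemy')
#   u'barthelemy'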
def tokenize(s, whitespace=False):
    # The C tokenizer returns (start, length) offsets into the UTF-8 byte
    # string, so slice the encoded bytes and decode each token back to
    # Unicode.
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u, whitespace)]
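
# A usage sketch for tokenize. Each token is paired with its token type;
# the exact type constants come from the postal.tokens module, so the
# output below is illustrative of the shape rather than a guaranteed result.
#
#   >>> tokenize(u'100 Main Street')
#   [(u'100', token_types.NUMERIC), (u'Main', token_types.WORD),
#    (u'Street', token_types.WORD)]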