def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True, whitespace=False,
                      languages=None):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations,
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    normalized_tokens = _normalize.normalized_tokens(
        s, string_options, token_options, whitespace, languages=languages)

    if strip_parentheticals:
        normalized_tokens = remove_parens(normalized_tokens)

    return [(s, token_types.from_id(token_type))
            for s, token_type in normalized_tokens]
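
# A usage sketch for normalized_tokens. It returns a list of
# (token, token_type) tuples; the exact tokens and type constants depend on
# the installed libpostal data and the postal.tokens module, so the output
# below is illustrative of the shape, not a guaranteed result.
#
#   >>> normalized_tokens(u'St.-Barthélemy')
#   [(u'saint', token_types.WORD), (u'barthelemy', token_types.WORD)]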
def parse_address(address, language=None, country=None):
    """
    Parse address into components.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    """
    address = safe_decode(address, 'utf-8')
    return _parser.parse_address(address, language=language, country=country)
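
# A usage sketch for parse_address. It returns a list of (component, label)
# tuples; segmentation comes from the trained libpostal model, so the output
# below is illustrative only.
#
#   >>> parse_address(u'781 Franklin Ave Crown Heights Brooklyn NY 11216 USA')
#   [(u'781', u'house_number'), (u'franklin ave', u'road'),
#    (u'crown heights', u'suburb'), (u'brooklyn', u'city_district'),
#    (u'ny', u'state'), (u'11216', u'postcode'), (u'usa', u'country')]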
def expand_address(address, languages=None, **kw):
    """
    Expand the given address into one or more normalized strings.

    Required
    --------
    @param address: the address as either Unicode or a UTF-8 encoded string

    Options
    -------
    @param languages: a tuple or list of ISO language code strings
        (e.g. "en", "fr", "de", etc.) to use in expansion. If None is
        passed, use the language classifier to detect the language
        automatically.
    @param address_components: an integer (bit-set) of address component
        expansions to use e.g. ADDRESS_NAME | ADDRESS_STREET would use
        only expansions which apply to venue names or streets.
    @param latin_ascii: use the Latin to ASCII transliterator, which
        normalizes e.g. æ => ae
    @param transliterate: use any available transliterators for non-Latin
        scripts, e.g. the Greek phrase διαφορετικούς becomes diaphoretikoús̱
    @param strip_accents: strip accented characters e.g. é => e, ç => c.
        This loses some information in various languages, but in general
        we want accented and unaccented spellings of the same address
        to produce the same expansion.
    @param decompose: perform Unicode normalization (NFD form)
    @param lowercase: UTF-8 lowercase the string
    @param trim_string: trim spaces on either side of the string
    @param replace_word_hyphens: add a version of the string with hyphens
        replaced by spaces
    @param delete_word_hyphens: add a version of the string with hyphens
        deleted
    @param replace_numeric_hyphens: add a version of the string with
        numeric hyphens replaced e.g. 12345-6789 => 12345 6789
    @param delete_numeric_hyphens: add a version of the string with
        numeric hyphens removed e.g. 12345-6789 => 123456789
    @param split_alpha_from_numeric: split tokens like CR17 into CR 17,
        which helps with expansion of certain types of highway
        abbreviations
    @param delete_final_periods: remove final periods on abbreviations
        e.g. St. => St
    @param delete_acronym_periods: remove periods in acronyms
        e.g. U.S.A. => USA
    @param drop_english_possessives: normalize possessives
        e.g. Mark's => Marks
    @param delete_apostrophes: delete apostrophes e.g. O'Malley => OMalley
    @param expand_numex: convert numeric expressions e.g. Twenty sixth =>
        26th, using either the supplied languages or the result of
        automatic language classification.
    @param roman_numerals: normalize Roman numerals e.g. IX => 9. Since
        these can be ambiguous (especially I and V), turning this on
        simply adds another version of the string if any potential Roman
        numerals are found.
    """
    address = safe_decode(address, 'utf-8')
    return _expand.expand_address(address, languages=languages, **kw)
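
# A usage sketch for expand_address. Expansions come back as a list of
# normalized strings; the exact set depends on the installed libpostal data,
# so the output below is illustrative and may include further variants.
#
#   >>> expand_address(u'Quatre vingt douze Ave des Champs-Élysées')
#   [u'92 avenue des champs-elysees', u'92 avenue des champs elysees', ...]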
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS, languages=None):
    """Normalize a string with the given string-level options."""
    s = safe_decode(s)
    return _normalize.normalize_string(s, string_options, languages=languages)
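
# A usage sketch for normalize_string, assuming DEFAULT_STRING_OPTIONS
# includes lowercasing and accent stripping; the output is illustrative and
# depends on the option flags actually set.
#
#   >>> normalize_string(u'Barthélemy')
#   u'barthelemy'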
def tokenize(s, whitespace=False):
    # The C tokenizer returns (start, length) offsets into the UTF-8 byte
    # string, so slice the encoded bytes and decode each token back to
    # Unicode.
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u, whitespace)]
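
# A usage sketch for tokenize. Each token is paired with its token type;
# the exact type constants come from the postal.tokens module, so the
# output below is illustrative of the shape rather than a guaranteed result.
#
#   >>> tokenize(u'100 Main Street')
#   [(u'100', token_types.NUMERIC), (u'Main', token_types.WORD),
#    (u'Street', token_types.WORD)]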