# Example 1
def normalized_tokens(s,
                      string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True,
                      whitespace=False,
                      languages=None):
    '''
    Normalize a string, tokenize it, and normalize each token using the
    given string-level and token-level options.

    Only libpostal's deterministic normalizations (methods with a single
    output) are used here; the string-tree variant can produce multiple
    normalized strings, each with its own tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    decoded = safe_decode(s)
    tokens = _normalize.normalized_tokens(decoded,
                                          string_options,
                                          token_options,
                                          whitespace,
                                          languages=languages)

    if strip_parentheticals:
        tokens = remove_parens(tokens)

    # Map each numeric token-type id onto its token_types object.
    return [(tok, token_types.from_id(type_id)) for tok, type_id in tokens]
# Example 2
def parse_address(address, language=None, country=None):
    """
    Parse an address string into labeled components.

    @param address: the address as either Unicode or a UTF-8 encoded string
    @param language (optional): language code
    @param country (optional): country code
    """
    decoded = safe_decode(address, 'utf-8')
    return _parser.parse_address(decoded, language=language, country=country)
# Example 3
def expand_address(address, languages=None, **kw):
    """
    Expand the given address into one or more normalized strings.

    Required
    --------
    @param address: the address as either Unicode or a UTF-8 encoded string

    Options
    -------
    @param languages: a tuple or list of ISO language code strings (e.g. "en", "fr", "de", etc.)
                      to use in expansion. If None is passed, use language classifier
                      to detect language automatically.
    @param address_components: an integer (bit-set) of address component expansions
                               to use e.g. ADDRESS_NAME | ADDRESS_STREET would use
                               only expansions which apply to venue names or streets.
    @param latin_ascii: use the Latin to ASCII transliterator, which normalizes e.g. æ => ae
    @param transliterate: use any available transliterators for non-Latin scripts, e.g.
                          for the Greek phrase διαφορετικούς becomes diaphoretikoús̱
    @param strip_accents: strip accented characters e.g. é => e, ç => c. This loses some
                          information in various languages, but in general makes
                          matching accent-insensitive, which is usually desirable.
    @param decompose: perform Unicode normalization (NFD form)
    @param lowercase: UTF-8 lowercase the string
    @param trim_string: trim spaces on either side of the string
    @param replace_word_hyphens: add version of the string replacing hyphens with space
    @param delete_word_hyphens: add version of the string with hyphens deleted
    @param replace_numeric_hyphens: add version of the string with numeric hyphens replaced
                                    e.g. 12345-6789 => 12345 6789
    @param delete_numeric_hyphens: add version of the string with numeric hyphens removed
                                   e.g. 12345-6789 => 123456789
    @param split_alpha_from_numeric: split tokens like CR17 into CR 17, helps with expansion
                                     of certain types of highway abbreviations
    @param delete_final_periods: remove final periods on abbreviations e.g. St. => St
    @param delete_acronym_periods: remove periods in acronyms e.g. U.S.A. => USA
    @param drop_english_possessives: normalize possessives e.g. Mark's => Marks
    @param delete_apostrophes: delete apostrophes e.g. O'Malley => OMalley
    @param expand_numex: converts numeric expressions e.g. Twenty sixth => 26th,
                         using either the supplied languages or the result of
                         automated language classification.
    @param roman_numerals: normalize Roman numerals e.g. IX => 9. Since these can be
                           ambiguous (especially I and V), turning this on simply
                           adds another version of the string if any potential
                           Roman numerals are found.
    """
    address = safe_decode(address, 'utf-8')
    return _expand.expand_address(address, languages=languages, **kw)
# Example 4
def expand_address(address, languages=None, **kw):
    """
    Expand the given address into one or more normalized strings.

    Required
    --------
    @param address: the address as either Unicode or a UTF-8 encoded string

    Options
    -------
    @param languages: a tuple or list of ISO language code strings (e.g. "en", "fr", "de", etc.)
                      to use in expansion. If None is passed, use language classifier
                      to detect language automatically.
    @param address_components: an integer (bit-set) of address component expansions
                               to use e.g. ADDRESS_NAME | ADDRESS_STREET would use
                               only expansions which apply to venue names or streets.
    @param latin_ascii: use the Latin to ASCII transliterator, which normalizes e.g. æ => ae
    @param transliterate: use any available transliterators for non-Latin scripts, e.g.
                          for the Greek phrase διαφορετικούς becomes diaphoretikoús̱
    @param strip_accents: strip accented characters e.g. é => e, ç => c. This loses some
                          information in various languages, but in general makes
                          matching accent-insensitive, which is usually desirable.
    @param decompose: perform Unicode normalization (NFD form)
    @param lowercase: UTF-8 lowercase the string
    @param trim_string: trim spaces on either side of the string
    @param replace_word_hyphens: add version of the string replacing hyphens with space
    @param delete_word_hyphens: add version of the string with hyphens deleted
    @param replace_numeric_hyphens: add version of the string with numeric hyphens replaced
                                    e.g. 12345-6789 => 12345 6789
    @param delete_numeric_hyphens: add version of the string with numeric hyphens removed
                                   e.g. 12345-6789 => 123456789
    @param split_alpha_from_numeric: split tokens like CR17 into CR 17, helps with expansion
                                     of certain types of highway abbreviations
    @param delete_final_periods: remove final periods on abbreviations e.g. St. => St
    @param delete_acronym_periods: remove periods in acronyms e.g. U.S.A. => USA
    @param drop_english_possessives: normalize possessives e.g. Mark's => Marks
    @param delete_apostrophes: delete apostrophes e.g. O'Malley => OMalley
    @param expand_numex: converts numeric expressions e.g. Twenty sixth => 26th,
                         using either the supplied languages or the result of
                         automated language classification.
    @param roman_numerals: normalize Roman numerals e.g. IX => 9. Since these can be
                           ambiguous (especially I and V), turning this on simply
                           adds another version of the string if any potential
                           Roman numerals are found.
    """
    address = safe_decode(address, 'utf-8')
    return _expand.expand_address(address, languages=languages, **kw)
# Example 5
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS, languages=None):
    """Normalize a string with the given options and optional language hints."""
    decoded = safe_decode(s)
    return _normalize.normalize_string(decoded, string_options, languages=languages)
# Example 6
def normalize_string(s, string_options=DEFAULT_STRING_OPTIONS):
    """Normalize a string with the given string-level options."""
    decoded = safe_decode(s)
    return _normalize.normalize_string(decoded, string_options)
# Example 7
def tokenize(s, whitespace=False):
    """
    Tokenize a string, returning a list of (token, token_type) pairs.

    The C tokenizer is run over the decoded (Unicode) text, but token
    slices are taken from the UTF-8 encoded form and decoded back —
    presumably the returned offsets are byte offsets (TODO confirm
    against _tokenize's contract).
    """
    unicode_text = safe_decode(s)
    encoded = safe_encode(s)
    result = []
    for start, length, type_id in _tokenize.tokenize(unicode_text, whitespace):
        token = safe_decode(encoded[start:start + length])
        result.append((token, token_types.from_id(type_id)))
    return result