def normalized_tokens(s, string_options=DEFAULT_STRING_OPTIONS,
                      token_options=DEFAULT_TOKEN_OPTIONS,
                      strip_parentheticals=True):
    '''
    Normalizes a string, tokenizes, and normalizes each token
    with string and token-level options.

    This version only uses libpostal's deterministic normalizations,
    i.e. methods with a single output. The string tree version will
    return multiple normalized strings, each with tokens.

    Usage:
        normalized_tokens(u'St.-Barthélemy')
    '''
    s = safe_decode(s)
    if string_options & _normalize.NORMALIZE_STRING_LATIN_ASCII:
        normalized = _normalize.normalize_string_latin(s, string_options)
    else:
        normalized = _normalize.normalize_string_utf8(s, string_options)

    # Tuples of (offset, len, type)
    raw_tokens = tokenize_raw(normalized)
    tokens = [(_normalize.normalize_token(normalized, t, token_options),
               token_types.from_id(t[-1]))
              for t in raw_tokens]

    if strip_parentheticals:
        return remove_parens(tokens)
    else:
        return tokens
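# Usage sketch: assumes this module is importable as postal.normalize, per
# the pypostal package layout; the exact token strings and token types
# returned depend on the libpostal version and the options passed.
#
#     from postal.normalize import normalized_tokens
#
#     for token, token_type in normalized_tokens(u'St.-Barthélemy'):
#         print(token, token_type)
#
# Each element is a (normalized_string, token_type) pair, e.g. a lowercased,
# transliterated token such as u'st' paired with a token_types member like
# token_types.ABBREVIATION or token_types.WORD.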
def strip_component(self, value, tagged=False):
    if not tagged:
        comma = token_types.COMMA.value
        hyphen = token_types.HYPHEN.value

        # Tokenize the stripped string and slice that same stripped string,
        # so the raw token offsets line up with the text being sliced
        value = value.strip()

        start = end = 0
        tokens = tokenize_raw(value)

        # Advance start past any leading comma/hyphen tokens
        for token_start, token_length, token_type in tokens:
            start = token_start
            if token_type not in (comma, hyphen):
                break
            else:
                start = token_start + token_length

        # Pull end back past any trailing comma/hyphen tokens
        for token_start, token_length, token_type in reversed(tokens):
            end = token_start + token_length
            if token_type not in (comma, hyphen):
                break
            else:
                end = token_start

        return value[start:end]
    else:
        start = end = 0
        tokens = value.split()

        separator_tag = self.separator_tag

        # Tagged tokens have the form 'token/tag'; skip leading tokens
        # tagged as separators
        for i, t in enumerate(tokens):
            t, c = t.rsplit('/', 1)
            start = i
            if c != separator_tag:
                break
            else:
                start = i + 1

        num_tokens = len(tokens)

        # Skip trailing tokens tagged as separators
        for j, t in enumerate(reversed(tokens)):
            t, c = t.rsplit('/', 1)
            end = num_tokens - j
            if c != separator_tag:
                break
            else:
                end = num_tokens - j - 1

        return u' '.join(tokens[start:end])
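# Example sketch: strip_component is a method, so this assumes an enclosing
# class whose separator_tag attribute is e.g. u'SEP'; the u'ROAD' tag below
# is purely illustrative and not taken from the source.
#
#     # Untagged: trims leading/trailing comma and hyphen tokens by raw offsets
#     self.strip_component(u', Main Street,')
#     # -> u'Main Street'
#
#     # Tagged: tokens look like u'token/tag'; trims separator-tagged tokens
#     self.strip_component(u',/SEP Main/ROAD Street/ROAD', tagged=True)
#     # -> u'Main/ROAD Street/ROAD'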