def seo_tokenize(title, lower=True):
    """Get SEO-tokenized version of a string, typically a name or title

    `title` the string to tokenize
    `lower` whether to lowercase the string

    Leading/trailing whitespace is stripped, every character other than a
    space or an ASCII letter/digit is removed, and the remaining
    space-separated words are joined with hyphens. Only literal spaces act
    as word separators: other whitespace (tabs, newlines) inside the title
    is removed by the character filter, not turned into a hyphen.

    e.g.
    <- "The World's Greatest Establishment"
    -> 'the-worlds-greatest-establishment'

    <- 'Recreational Sports Facility, Berkeley, CA', lower=False
    -> 'Recreational-Sports-Facility-Berkeley-CA'

    """
    cleaned_title = title.strip()
    try:
        cleaned_title = unicode_to_ascii(cleaned_title)
    except Exception:
        # best effort: carry on with the unconverted title; characters
        # outside the allowed set are removed by the filter below.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        pass
    if lower:
        cleaned_title = cleaned_title.lower()
    # allow only spaces, alpha-numeric
    cleaned_title = re.sub(r'[^ A-Za-z0-9]', '', cleaned_title)
    # split() collapses runs of spaces, so no empty tokens are produced
    return '-'.join(cleaned_title.split())
def seo_tokenize(title, lower=True):
    """Get SEO-tokenized version of a string, typically a name or title

    `title` the string to tokenize
    `lower` whether to lowercase the string

    Leading/trailing whitespace is stripped, every character other than a
    space, a hyphen or an ASCII letter/digit is removed, and the remaining
    space-separated words are joined with hyphens. Hyphens already present
    in the title are kept as-is.

    e.g.
    <- "The World's Greatest Establishment"
    -> 'the-worlds-greatest-establishment'

    <- 'Recreational Sports Facility, Berkeley, CA', lower=False
    -> 'Recreational-Sports-Facility-Berkeley-CA'

    """
    cleaned_title = title.strip()
    try:
        cleaned_title = unicode_to_ascii(cleaned_title)
    except Exception:
        # best effort: carry on with the unconverted title; characters
        # outside the allowed set are removed by the filter below.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        pass
    if lower:
        cleaned_title = cleaned_title.lower()
    # allow only spaces, hyphens, alpha-numeric
    # (raw string: '\-' is not a valid escape in a plain string literal)
    cleaned_title = re.sub(r'[^ \-A-Za-z0-9]', '', cleaned_title)
    # split() collapses runs of spaces, so no empty tokens are produced
    return '-'.join(cleaned_title.split())
def seo_tokenize(title, lower=True, preserve_ascii_extended=False,
                 preserve_unicode=False):
    """Get SEO-tokenized version of a string, typically a name or title

    `title` the string to tokenize
    `lower` whether to lowercase the string
    `preserve_ascii_extended` will preserve extended ASCII characters
        if `True`
    `preserve_unicode` will preserve Unicode if `True`

    Spaces, hyphens and ASCII letters/digits are always kept. Any other
    character is kept only when the matching `preserve_*` flag is set
    (as classified by `is_ascii_extended` / `is_ascii`); otherwise it is
    removed. The surviving whitespace-separated words are joined with
    hyphens.

    e.g.
    <- "The World's Greatest Establishment"
    -> 'the-worlds-greatest-establishment'

    <- 'Recreational Sports Facility, Berkeley, CA', lower=False
    -> 'Recreational-Sports-Facility-Berkeley-CA'

    """
    cleaned_title = title.strip()

    # only convert to ASCII when the caller wants neither extended ASCII
    # nor Unicode kept in the result
    if not (preserve_ascii_extended or preserve_unicode):
        try:
            cleaned_title = unicode_to_ascii(cleaned_title)
        except Exception:
            # best effort: carry on with the unconverted title; with both
            # preserve flags off, _repl below drops every character that
            # is not in the always-accepted set.
            # `Exception` (not a bare except) so that KeyboardInterrupt
            # and SystemExit still propagate.
            pass

    if lower:
        cleaned_title = cleaned_title.lower()

    def _repl(matchobj):
        # called once per character outside the always-accepted set;
        # returns the character to keep it, or '' to drop it
        c = matchobj.group(0)
        if is_ascii_extended(c):
            return c if preserve_ascii_extended else ''
        if is_ascii(c):
            # it is ASCII, but not one of the accepted ASCII characters
            return ''
        return c if preserve_unicode else ''

    cleaned_title = re.sub(r'[^ \-A-Za-z0-9]', _repl, cleaned_title)
    # replace whitespace in string with hyphens
    return '-'.join(cleaned_title.split())
def ssml_sanitized(s):
    """Get a sanitized version of a string with ampersands spelled out

    `s` the string to sanitize

    The string is first passed through `unicode_to_ascii`, then every '&'
    in the result is replaced by the word 'and'.
    """
    ascii_text = unicode_to_ascii(s)
    # NOTE(review): presumably done because a bare '&' is not valid inside
    # SSML/XML markup -- confirm against the callers
    return re.sub(r'&', 'and', ascii_text)