Exemple #1
0
def seo_tokenize(title, lower=True):
    """Get SEO-tokenized version of a string, typically a name or title

    `title` the string to tokenize
    `lower` whether to lowercase the string

    Leading/trailing whitespace is stripped, every character other than a
    space or an ASCII letter/digit is removed, and the remaining
    space-separated words are joined with single hyphens.

    e.g.
    <- "The World's Greatest Establishment"
    -> 'the-worlds-greatest-establishment'

    <- 'Recreational Sports Facility, Berkeley, CA', lower=False
    -> 'Recreational-Sports-Facility-Berkeley-CA'
    """
    cleaned_title = title.strip()
    try:
        cleaned_title = unicode_to_ascii(cleaned_title)
    except Exception:
        # Best-effort conversion: on failure keep the stripped title as-is.
        # `Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit still propagate.
        pass
    if lower:
        cleaned_title = cleaned_title.lower()
    # allow only spaces, alpha-numeric
    cleaned_title = re.sub(r'[^ A-Za-z0-9]', '', cleaned_title)
    # split() without arguments collapses runs of spaces, so consecutive
    # spaces never produce consecutive hyphens
    return '-'.join(cleaned_title.split())
Exemple #2
0
def seo_tokenize(title, lower=True):
    """Get SEO-tokenized version of a string, typically a name or title

    `title` the string to tokenize
    `lower` whether to lowercase the string

    Leading/trailing whitespace is stripped, every character other than a
    space, a hyphen or an ASCII letter/digit is removed, and the remaining
    space-separated words are joined with single hyphens. Hyphens already
    present in the title are kept verbatim.

    e.g.
    <- "The World's Greatest Establishment"
    -> 'the-worlds-greatest-establishment'

    <- 'Recreational Sports Facility, Berkeley, CA', lower=False
    -> 'Recreational-Sports-Facility-Berkeley-CA'
    """
    cleaned_title = title.strip()
    try:
        cleaned_title = unicode_to_ascii(cleaned_title)
    except Exception:
        # Best-effort conversion: on failure keep the stripped title as-is.
        # `Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit still propagate.
        pass
    if lower:
        cleaned_title = cleaned_title.lower()
    # allow only spaces, hyphens, alpha-numeric
    # (raw string: `\-` is not a valid escape in a normal string literal)
    cleaned_title = re.sub(r'[^ \-A-Za-z0-9]', '', cleaned_title)
    # split() without arguments collapses runs of spaces before joining
    return '-'.join(cleaned_title.split())
def seo_tokenize(title,
                 lower=True,
                 preserve_ascii_extended=False,
                 preserve_unicode=False):
    """Get SEO-tokenized version of a string, typically a name or title

    `title` the string to tokenize
    `lower` whether to lowercase the string
    `preserve_ascii_extended` will preserve extended ASCII characters if `True`
    `preserve_unicode` will preserve Unicode if `True`

    When either preserve flag is set, the title is NOT passed through
    `unicode_to_ascii`; characters outside the accepted set (space, hyphen,
    ASCII letters and digits) are then kept or dropped one by one according
    to the flag matching their class.

    e.g.
    <- "The World's Greatest Establishment"
    -> 'the-worlds-greatest-establishment'

    <- 'Recreational Sports Facility, Berkeley, CA', lower=False
    -> 'Recreational-Sports-Facility-Berkeley-CA'
    """
    cleaned_title = title.strip()

    if not (preserve_ascii_extended or preserve_unicode):
        try:
            cleaned_title = unicode_to_ascii(cleaned_title)
        except Exception:
            # Best-effort conversion: on failure keep the stripped title.
            # `Exception` (not a bare `except`) so KeyboardInterrupt and
            # SystemExit still propagate.
            pass

    if lower:
        cleaned_title = cleaned_title.lower()

    def _repl(matchobj):
        # Decide the fate of a single character outside the accepted set.
        c = matchobj.group(0)
        if is_ascii_extended(c):
            replaced_c = c if preserve_ascii_extended else ''
        elif is_ascii(c):
            # it is ASCII, but not one of the accepted ASCII characters
            replaced_c = ''
        else:
            replaced_c = c if preserve_unicode else ''
        return replaced_c

    cleaned_title = re.sub(r'[^ \-A-Za-z0-9]', _repl, cleaned_title)

    # replace whitespace in string with hyphens
    tokenized_title = '-'.join(cleaned_title.split())

    return tokenized_title
Exemple #4
0
def ssml_sanitized(s):
    """Return `s` converted via `unicode_to_ascii` with each '&' replaced by 'and'

    `s` the string to sanitize

    NOTE(review): presumably the ampersand is spelled out because a bare '&'
    is not valid inside SSML/XML markup -- confirm against the caller.
    """
    ascii_text = unicode_to_ascii(s)
    return re.sub(r'&', 'and', ascii_text)