Example #1
0
def build_email_re(tldlist=None):
    """Build a compiled, case-insensitive regex that locates email addresses.

    Args:
        tldlist: iterable of accepted top-level domains without the leading
            dot (entries containing dots, such as "co.uk", are allowed).
            When None, the result of get_IANA_TLD_list() is used.

    Returns:
        A compiled pattern whose single capture group holds the address.
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # one character of the user part (everything allowed except the dot)
    atom = r"[a-z0-9!#$%&'*+\/=?^_`{|}~-]"

    # re.escape makes every TLD entry match literally; the previous
    # replace('.', '\.') relied on an invalid string escape and left any
    # other regex metacharacter in an entry active.
    tld_alternatives = r"|".join(re.escape(tld) for tld in tldlist)

    parts = [
        r"(?=.{0,64}\@)",  # an @ must follow within 64 chars (limits userpart)
        r"(?<!" + atom + r")",  # start boundary: not in the middle of a userpart
        r"(",  # capture email
        atom + r"+",  # no dot in beginning
        r"(?:\." + atom + r"+)*",  # no consecutive dots, no ending dot
        r"\@",
        r"[-a-z0-9._]+\.",  # hostname
        r"(?:" + tld_alternatives + r")",  # tldgroup
        r")",
        r"(?!(?:[a-z0-9-]|\.[a-z0-9]))",  # make sure domain ends here
    ]
    return re.compile("".join(parts), re.IGNORECASE)
Example #2
0
def build_email_re(tldlist=None):
    """Return a compiled regex for extracting email addresses from text.

    The pattern is case-insensitive and captures the whole address in
    group 1. An address is only accepted when its domain ends in one of the
    given top-level domains and is not directly followed by further
    hostname characters.

    Args:
        tldlist: iterable of top-level domains (no leading dot). Defaults to
            the list returned by get_IANA_TLD_list() when None.

    Returns:
        The compiled regular expression object.
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # Escape each TLD so it is matched as literal text. The former
    # x.replace('.', '\.') used an invalid escape sequence in a normal
    # string literal and only neutralised dots.
    tlds = "|".join([re.escape(x) for x in tldlist])

    local_char = r"[a-z0-9!#$%&'*+\/=?^_`{|}~-]"
    # dot-separated runs of local_char: no leading, trailing or double dots
    local_part = local_char + r"+(?:\." + local_char + r"+)*"
    # hostname labels followed by a dot and one of the known TLDs
    domain = r"[-a-z0-9._]+\.(?:" + tlds + r")"

    reg = (
        r"(?=.{0,64}\@)"  # limit userpart to 64 chars
        + r"(?<!" + local_char + r")"  # start boundary
        + r"(" + local_part + r"\@" + domain + r")"  # capture email
        + r"(?!(?:[a-z0-9-]|\.[a-z0-9]))"  # make sure domain ends here
    )
    return re.compile(reg, re.IGNORECASE)
Example #3
0
def build_search_re(tldlist=None):
    """Build a compiled, case-insensitive regex that finds URLs/hosts in text.

    A match may start with an optional http(s)/ftp scheme and credentials,
    continues with a host (a name ending in a known TLD, or one of the
    REGEX_IPV4 / bracketed REGEX_IPV6 alternatives), an optional port, and
    optional path and query parts.

    Args:
        tldlist: iterable of accepted top-level domains without the leading
            dot. When None, the result of get_IANA_TLD_list() is used.

    Returns:
        The compiled regular expression object (no capture groups are added
        by this function itself).
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    hostname_chars = r"-a-z0-9_"
    path_chars = r"-a-z0-9._/%#\[\]~*"
    param_chars = r"-a-z0-9;._/\[\]?#+%&=@*"

    # re.escape matches each TLD literally; the old replace('.', '\.') used
    # an invalid string escape and only handled dots.
    tld_group = r"|".join(re.escape(tld) for tld in tldlist)

    # lookbehind to check for start of url; a url may start
    # - at the start of the string
    # - after whitespace
    # - after " (href) or ' (borked href)
    # - after > (links in tags)
    # - after ) (closing parentheses, seen in chinese spam)
    # - after * or - (seen in spam)
    url_start = r"(?:(?<=^)|(?<=" + r"(?:\s|[\"'\>\)\*-])" + r"))"

    # optional protocol with optional username/pw
    scheme_and_auth = (
        r"(?:"
        r"(?:https?://|ftp://)"
        r"(?:[a-z0-9!%_$]+(?::[a-z0-9!%_$]+)?@)?"
        r")?"
    )

    # standard domain
    named_host = (
        r"[a-z0-9_]"  # first char can't be a hyphen
        + r"[" + hostname_chars + r"]*"  # one-char domains exist, like 'x.org'
        + r"(?:\.[" + hostname_chars + r"]+)*"  # more hostname parts
        + r"\."  # dot between hostname and tld
        + r"(?:" + tld_group + r")\.?"  # standard domain can end with a dot
    )
    dotquad = r"|%s" % REGEX_IPV4
    ip6 = r"|\[%s\]" % REGEX_IPV6
    host = r"(?:" + named_host + dotquad + ip6 + r")"

    port = r"(?:\:\d{1,5})?"

    # after the domain, there must be a path sep or quotes space or ? end
    host_end = r"""(?=["'/?]|\s|$)"""

    path = r"(?:\/[" + path_chars + r"]+)*"
    trailing_slash = r"(?:\/?)"  # end domain with optional slash
    # params must follow after a question mark
    params = r"(?:\?[" + param_chars + r"]*)?"

    reg = "".join([
        url_start,
        scheme_and_auth,
        host,
        port,
        host_end,
        path,
        trailing_slash,
        params,
    ])
    return re.compile(reg, re.IGNORECASE)
Example #4
0
def build_search_re(tldlist=None):
    """Return a compiled regex that searches text for URLs and bare hosts.

    The pattern is case-insensitive. It accepts an optional http(s)/ftp
    scheme with optional credentials, then a host (a hostname ending in one
    of the given TLDs, or the REGEX_IPV4 / bracketed REGEX_IPV6
    alternatives), an optional port, and optional path and query string.

    Args:
        tldlist: iterable of top-level domains (no leading dot). Defaults to
            the list returned by get_IANA_TLD_list() when None.

    Returns:
        The compiled regular expression object.
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # All fragments below are raw strings: the previous mix of raw and
    # normal literals contained invalid escapes such as "\." and "\/".

    # lookbehind to check for start of url
    # start with
    # - start of string
    # - whitespace
    # - " for href
    # - > for links in tags
    # - ) after closing parentheses (seen in chinese spam)
    # - * seen in spam
    reg = r"(?:(?<=^)|(?<="
    reg += r"(?:\s|[\"\>\)\*])"
    reg += r"))"

    # url starts here
    reg += r"(?:"
    reg += r"(?:https?://|ftp://)"  # protocol
    reg += r"(?:[a-z0-9!%_$]+(?::[a-z0-9!%_$]+)?@)?"  # username/pw
    reg += r")?"

    # domain
    reg += r"(?:"  # domain types

    # standard domain
    allowed_hostname_chars = r"-a-z0-9_"
    reg += r"[a-z0-9_]"  # first char can't be a hyphen
    # there are domains with only one character, like 'x.org'
    reg += r"[" + allowed_hostname_chars + r"]*"
    # more hostname parts separated by dot
    reg += r"(?:\.[" + allowed_hostname_chars + r"]+)*"
    reg += r"\."  # dot between hostname and tld
    reg += r"(?:"  # tldgroup
    # escape every TLD so it is matched as literal text
    reg += r"|".join([re.escape(x) for x in tldlist])
    reg += r")\.?"  # standard domain can end with a dot

    # dotquad
    reg += r"|%s" % REGEX_IPV4

    # ip6
    reg += r"|\[%s\]" % REGEX_IPV6

    reg += r")"  # end of domain types

    # optional port
    reg += r"(?:\:\d{1,5})?"

    # after the domain, there must be a path sep or quotes space or ? end,
    # check with lookahead
    reg += r"""(?=["'/?]|\s|$)"""

    # path
    allowed_path_chars = r"-a-z0-9._/%#\[\]~"
    reg += r"(?:\/[" + allowed_path_chars + r"]+)*"

    # request params
    allowed_param_chars = r"-a-z0-9;._/\[\]?#+%&=@"
    reg += r"(?:\/?)"  # end domain with optional slash
    # params must follow after a question mark
    reg += r"(?:\?[" + allowed_param_chars + r"]*)?"

    return re.compile(reg, re.IGNORECASE)