def build_email_re(tldlist=None):
    """Build a compiled regex that finds email addresses in free text.

    Args:
        tldlist: iterable of top-level-domain strings (e.g. ``"com"``,
            ``"co.uk"``) that the domain part must end with. Every entry is
            matched literally. When ``None``, the list returned by
            ``get_IANA_TLD_list()`` is used.

    Returns:
        A case-insensitive compiled pattern whose group 1 is the complete
        address (user part, ``@``, hostname and TLD).
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # characters accepted in the user part (used inside a character class)
    user_chars = r"a-z0-9!#$%&'*+\/=?^_`{|}~-"

    # re.escape() treats each TLD as literal text; the former
    # x.replace('.', '\.') relied on an invalid string escape and left every
    # metacharacter other than the dot unescaped.
    tld_group = "|".join(re.escape(tld) for tld in tldlist)

    fragments = [
        r"(?=.{0,64}\@)",  # limit userpart to 64 chars
        r"(?<![" + user_chars + r"])",  # start boundary
        r"(",  # capture email
        r"[" + user_chars + r"]+",  # no dot in beginning
        r"(?:\.[" + user_chars + r"]+)*",  # no consecutive dots, no ending dot
        r"\@",
        r"[-a-z0-9._]+\.",  # hostname
        r"(?:" + tld_group + r")",  # tldgroup
        r")",  # end of captured email
        r"(?!(?:[a-z0-9-]|\.[a-z0-9]))",  # make sure domain ends here
    ]
    return re.compile("".join(fragments), re.IGNORECASE)
def build_email_re(tldlist=None):
    """Build a compiled regex that finds email addresses in free text.

    Args:
        tldlist: iterable of top-level-domain strings (e.g. ``"com"``,
            ``"co.uk"``) that the domain part must end with. Every entry is
            matched literally. When ``None``, the list returned by
            ``get_IANA_TLD_list()`` is used.

    Returns:
        A case-insensitive compiled pattern whose group 1 is the complete
        address (user part, ``@``, hostname and TLD).
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # characters accepted in the user part (used inside a character class)
    user_chars = r"a-z0-9!#$%&'*+\/=?^_`{|}~-"

    # re.escape() treats each TLD as literal text; the former
    # x.replace('.', '\.') relied on an invalid string escape and left every
    # metacharacter other than the dot unescaped.
    tld_group = "|".join(re.escape(tld) for tld in tldlist)

    fragments = [
        r"(?=.{0,64}\@)",  # limit userpart to 64 chars
        r"(?<![" + user_chars + r"])",  # start boundary
        r"(",  # capture email
        r"[" + user_chars + r"]+",  # no dot in beginning
        r"(?:\.[" + user_chars + r"]+)*",  # no consecutive dots, no ending dot
        r"\@",
        r"[-a-z0-9._]+\.",  # hostname
        r"(?:" + tld_group + r")",  # tldgroup
        r")",  # end of captured email
        r"(?!(?:[a-z0-9-]|\.[a-z0-9]))",  # make sure domain ends here
    ]
    return re.compile("".join(fragments), re.IGNORECASE)
def build_search_re(tldlist=None):
    """Build a compiled regex that finds URLs / hostnames in free text.

    A match must be preceded by one of:
      - start of string
      - whitespace
      - " for href
      - ' for borked href
      - > for links in tags
      - ) after closing parentheses (seen in chinese spam)
      - * seen in spam
      - - seen in spam

    Args:
        tldlist: iterable of top-level-domain strings that a standard domain
            must end with. Every entry is matched literally. When ``None``,
            the list returned by ``get_IANA_TLD_list()`` is used.

    Returns:
        A case-insensitive compiled pattern matching an optional protocol
        (with optional username/password), a domain (standard domain,
        dotted-quad via ``REGEX_IPV4`` or bracketed ``REGEX_IPV6``), an
        optional port, an optional path and optional request parameters.
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # character-class contents for the individual URL components
    hostname_chars = r"-a-z0-9_"
    path_chars = r"-a-z0-9._/%#\[\]~*"
    param_chars = r"-a-z0-9;._/\[\]?#+%&=@*"

    # re.escape() treats each TLD as literal text; the former
    # x.replace('.', '\.') relied on an invalid string escape and left every
    # metacharacter other than the dot unescaped.
    tld_group = "|".join(re.escape(tld) for tld in tldlist)

    fragments = [
        # lookbehind to check for start of url
        r"(?:(?<=^)|(?<=(?:\s|[\"'\>\)\*-])))",
        # url starts here: optional protocol with optional username/pw
        r"(?:(?:https?://|ftp://)(?:[a-z0-9!%_$]+(?::[a-z0-9!%_$]+)?@)?)?",
        # domain
        r"(?:",  # domain types
        # standard domain
        r"[a-z0-9_]",  # first char can't be a hyphen
        # there are domains with only one character, like 'x.org'
        r"[" + hostname_chars + r"]*",
        r"(?:\.[" + hostname_chars + r"]+)*",  # more hostname parts separated by dot
        r"\.",  # dot between hostname and tld
        r"(?:" + tld_group + r")\.?",  # tldgroup; standard domain can end with a dot
        r"|%s" % REGEX_IPV4,  # dotquad
        r"|\[%s\]" % REGEX_IPV6,  # ip6
        r")",  # end of domain types
        r"(?:\:\d{1,5})?",  # optional port
        # after the domain, there must be a path sep or quotes space or ? end,
        # check with lookahead
        r"""(?=["'/?]|\s|$)""",
        r"(?:\/[" + path_chars + r"]+)*",  # path
        r"(?:\/?)",  # end domain with optional slash
        r"(?:\?[" + param_chars + r"]*)?",  # params must follow after a question mark
    ]
    return re.compile("".join(fragments), re.IGNORECASE)
def build_search_re(tldlist=None):
    """Build a compiled regex that finds URLs / hostnames in free text.

    A match must be preceded by one of:
      - start of string
      - whitespace
      - " for href
      - > for links in tags
      - ) after closing parentheses (seen in chinese spam)
      - * seen in spam

    Args:
        tldlist: iterable of top-level-domain strings that a standard domain
            must end with. Every entry is matched literally. When ``None``,
            the list returned by ``get_IANA_TLD_list()`` is used.

    Returns:
        A case-insensitive compiled pattern matching an optional protocol
        (with optional username/password), a domain (standard domain,
        dotted-quad via ``REGEX_IPV4`` or bracketed ``REGEX_IPV6``), an
        optional port, an optional path and optional request parameters.
    """
    if tldlist is None:
        tldlist = get_IANA_TLD_list()

    # character-class contents for the individual URL components
    hostname_chars = r"-a-z0-9_"
    path_chars = r"-a-z0-9._/%#\[\]~"
    param_chars = r"-a-z0-9;._/\[\]?#+%&=@"

    # re.escape() treats each TLD as literal text; the former
    # x.replace('.', '\.') relied on an invalid string escape and left every
    # metacharacter other than the dot unescaped.
    tld_group = "|".join(re.escape(tld) for tld in tldlist)

    # Every fragment is a raw string so that regex escapes such as \. \/ \?
    # are never interpreted (or warned about) as Python string escapes.
    fragments = [
        # lookbehind to check for start of url
        r"(?:(?<=^)|(?<=(?:\s|[\"\>\)\*])))",
        # url starts here: optional protocol with optional username/pw
        r"(?:(?:https?://|ftp://)(?:[a-z0-9!%_$]+(?::[a-z0-9!%_$]+)?@)?)?",
        # domain
        r"(?:",  # domain types
        # standard domain
        r"[a-z0-9_]",  # first char can't be a hyphen
        # there are domains with only one character, like 'x.org'
        r"[" + hostname_chars + r"]*",
        r"(?:\.[" + hostname_chars + r"]+)*",  # more hostname parts separated by dot
        r"\.",  # dot between hostname and tld
        r"(?:" + tld_group + r")\.?",  # tldgroup; standard domain can end with a dot
        r"|%s" % REGEX_IPV4,  # dotquad
        r"|\[%s\]" % REGEX_IPV6,  # ip6
        r")",  # end of domain types
        r"(?:\:\d{1,5})?",  # optional port
        # after the domain, there must be a path sep or quotes space or ? end,
        # check with lookahead
        r"""(?=["'/?]|\s|$)""",
        r"(?:\/[" + path_chars + r"]+)*",  # path
        r"(?:\/?)",  # end domain with optional slash
        r"(?:\?[" + param_chars + r"]*)?",  # params must follow after a question mark
    ]
    return re.compile("".join(fragments), re.IGNORECASE)