def test_flag_ignorecase(self):
    """cffi_re2.IGNORECASE must make matching case-insensitive; default stays sensitive."""
    case_insensitive = cffi_re2.compile(r'a(b+)$', flags=cffi_re2.IGNORECASE)
    case_sensitive = cffi_re2.compile(r'a(b+)$')
    mixed_case_inputs = ["AB", "Ab", "aB", "aBb", "abB"]
    lower_case_inputs = ["ab", "abb"]
    # Without the flag, any uppercase letter prevents a match
    for text in mixed_case_inputs:
        assert_is_none(case_sensitive.match(text))
    for text in lower_case_inputs:
        assert_is_not_none(case_sensitive.match(text))
    # With the flag, case never matters
    for text in mixed_case_inputs + lower_case_inputs:
        assert_is_not_none(case_insensitive.match(text))
    # Official example from the Python re documentation
    replaced = cffi_re2.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam',
                            flags=cffi_re2.IGNORECASE)
    assert_equal(replaced, 'Baked Beans & Spam')
def test_basic_match(self):
    """match() must anchor at the left end only — not at both ends."""
    # A pattern matching only the middle must not match() from position 0
    assert_is_none(cffi_re2.compile(r'b+').match('abbcd'))
    # A right-anchored pattern that misses position 0 must not match()
    assert_is_none(cffi_re2.compile(r'[abc]+$').match('abbcd'))
    # A pattern covering the whole string does match
    assert_is_not_none(cffi_re2.compile(r'[abcd]+').match('abbcd'))
    # Left-anchored, not both-anchored
    left_anchored = cffi_re2.compile(r'a+')
    assert_is_not_none(left_anchored.match('aaab'))
    assert_is_none(left_anchored.match('baaab'))
def test_medium_complexity(self):
    """Check medium complexity regexes.

    Fix: the original asserted ``rgx.match("1$ dollar")`` twice in a row;
    the duplicate has been removed.
    """
    # Examples from github.com/ulikoehler/KATranslationCheck
    # 1: word-boundary match on "into"/"Into"
    rgx = cffi_re2.compile(r"\b[Ii]nto\b")
    assert_is_not_none(rgx.search("Into the darkness"))
    assert_is_not_none(rgx.search("I went into the darkness"))
    assert_is_none(rgx.search("abcde beintoaqe aqet"))
    # 2: currency amount with optional plural
    rgx = cffi_re2.compile(r"\d+\$\s*dollars?")
    assert_is_not_none(rgx.search("12$ dollars"))
    assert_is_not_none(rgx.match("12$ dollars"))
    assert_is_not_none(rgx.match("1$ dollar"))
    assert_is_not_none(rgx.match("1$ dollars"))
def using_cffi_re2(text):
    '''
    Find e-mail-like local parts followed by '@' or ' at ' using cffi_re2.

    It using cffi - https://github.com/vls/cffi_re2

    Fix: the pattern is now built from raw strings — the original plain
    strings contained `\\/`, `\\.` and `\\s`, which are invalid escape
    sequences in ordinary string literals (SyntaxWarning on Python >= 3.12).
    The resulting regex is unchanged (`\\/` and `/` are equivalent in RE2).
    '''
    pattern = cffi_re2.compile(
        (r"([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`"
         r"{|}~-]+)*(@|\sat\s))"))
    return pattern.findall(text)
def compile(self, rgx, flags=0):
    """Compile *rgx* (wrapped in a capture group), preferring the re2 engine.

    Falls back to the stdlib `re` module when re2 rejects the pattern
    (e.g. lookarounds/backreferences), bumping the compatibility counter.
    """
    wrapped = "({0})".format(rgx)
    self.numRegex += 1
    try:
        return cffi_re2.compile(wrapped, flags)
    except ValueError:
        # Enable this for debugging
        # print("Regex in compatibility mode: {0}".format(wrapped))
        self.numCompatRegex += 1
        return re.compile(wrapped, flags)
def contains(v, regex):
    """Removes meta data from regex then checks for a regex match """
    # On Python 3, the subject may arrive as bytes — decode before searching
    if six.PY3 and isinstance(v, bytes):
        v = v.decode()
    # Strip the metadata suffix (everything after the first literal '\;')
    bare_pattern = regex.split('\\;')[0]
    try:
        return cffi_re2.compile(bare_pattern,
                                flags=cffi_re2.IGNORECASE).search(v)
    except Exception as E:
        # re2 could not handle the pattern — report and retry with stdlib re
        print(str(E))
        print(regex)
        # print(str(v))
        return re.compile(bare_pattern, flags=re.IGNORECASE).search(v)
def test_flag_ignorecase(self):
    """IGNORECASE flag enables case-insensitive matching; default is sensitive."""
    ci_regex = cffi_re2.compile(r'a(b+)$', flags=cffi_re2.IGNORECASE)
    cs_regex = cffi_re2.compile(r'a(b+)$')
    # Case-sensitive: any uppercase variant is rejected
    for subject in ("AB", "Ab", "aB", "aBb", "abB"):
        assert_is_none(cs_regex.match(subject))
    # Case-sensitive: all-lowercase variants are accepted
    for subject in ("ab", "abb"):
        assert_is_not_none(cs_regex.match(subject))
    # Case-insensitive: every variant is accepted
    for subject in ("AB", "Ab", "aB", "aBb", "abB", "ab", "abb"):
        assert_is_not_none(ci_regex.match(subject))
    # Official example
    assert_equal(cffi_re2.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam',
                              flags=cffi_re2.IGNORECASE),
                 'Baked Beans & Spam')
def __init__(self, lang):
    """Initialize pattern-indexer state for language *lang*.

    NOTE(review): depends on helpers defined elsewhere in the project
    (RuleAutotranslator, get_image_regex, get_text_content_regex,
    read_texttag_index) whose exact semantics are not visible here.
    """
    self.lang = lang
    self.autotrans = RuleAutotranslator()
    # Preindex filter
    # Used to avoid indexing patterns with one instance
    self.preindex_ctr = Counter()  # norm engl hash => count
    self.preindex_min_count = 2  # minimum instances to be considered a pattern
    self.preindex_set = set(
    )  # Compiled from preindex_ctr in clean_preindex()
    self.index = Counter()  # norm engl => count
    self.untranslated_index = Counter()  # norm engl => count
    self.translated_index = defaultdict(
        Counter)  # norm engl => translation => count
    self.filename_index = defaultdict(
        Counter)  # norm_engl => {filename: count}
    # Inline TeX formulas delimited by '$...$'
    self._formula_re = re.compile(r"\$[^\$]+\$")
    self._img_re = get_image_regex()
    self._text = get_text_content_regex()
    self._transURLs = {}  # Translation URL examples
    # NOTE: Need to run indexer TWO TIMES to get accurate results
    # as the text tags first need to be updated to get an accurate IF index
    self.texttags = read_texttag_index(lang)
def test_invalid_regex(self):
    """Compile a lookahead pattern — a construct RE2 does not support."""
    pattern = '(?!=.*[没不])'
    robj = cffi_re2.compile(pattern)
def test_match_chinese(self):
    """search() must handle multi-byte UTF-8 patterns and subjects."""
    regex = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')
    # Non-CJK filler between the key characters is allowed
    assert_true(regex.search('梦1幻2西3游'))
    # CJK filler falls inside the excluded [一-龥] range, so no match
    assert_false(regex.search('梦倩女幻幽魂西2游'))
def get_input_re():
    """Return a regex matching input widget markers like '[[☃ input-number 1]]'."""
    widget_pattern = r"\[\[☃\s+[a-z-]+\s*\d*\]\]"
    return re.compile(widget_pattern)
def get_formula_re():
    """Return a regex matching inline TeX formulas delimited by '$...$'."""
    formula_pattern = r"\$[^\$]+\$"
    return re.compile(formula_pattern)
def get_start_invariant_regex():
    """Return a regex capturing translation-invariant leading content:
    blockquote markers, whitespace, dashes and literal backslash-n sequences."""
    invariant = r"^((>|\s+|-|\\n)*)\s*"
    return re.compile(invariant, re.UNICODE)
def test_basic_findall(self):
    """findall() returns every group-1 capture, in subject order."""
    matches = cffi_re2.compile(r'a(b+)').findall("abbcdefabbbbca")
    assert_is_not_none(matches)
    assert_equal(matches, ["bb", "bbbb"])
def test_basic_groups(self):
    """groups() returns the captured subgroups as a tuple."""
    match = cffi_re2.compile(r'a(b+)').search("abbc")
    assert_is_not_none(match)
    assert_equal(match.groups(), ("bb",))
def test_sub_basic(self):
    """sub() removes every non-overlapping run of 'b'."""
    pattern = cffi_re2.compile(r'b+')
    assert_equal(pattern.sub('', 'abbcbbd'), 'acd')
def test_sub_chinese(self):
    """sub() must work with multi-byte pattern, subject and replacement."""
    pattern = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')
    assert_equal(pattern.sub('倩女', '梦幻西游好玩吗?'), u'倩女好玩吗?')
# NOTE(review): this chunk begins mid-function — the enclosing `def`
# (apparently `profile_email_regex(reg, iteration_list, df_html)`, judging
# by the call sites below) and the setup of `start_time` and `iteration`
# are outside this view. Indentation below is reconstructed from a
# whitespace-mangled source — verify loop-body boundaries against VCS.
    for i in range(iteration):
        # Run the extraction once per repetition; only the last result is kept
        email_list = extract_emails(df_html["html"], df_html["url"], reg)
    end_time = time.time()
    total_time = end_time-start_time
    python_engine_list.append(total_time)
    print("total time (in seconds) for " + str(iteration) + " is ", end_time-start_time)
    return email_list, python_engine_list


if __name__ == "__main__":  # confirms that the code is under main function
    # Input corpus of raw HTML pages (hard-coded server path)
    df_html = pd.read_csv("/home/ubuntu/server_files/us_fda_raw_html.csv")
    iteration_list = [10,20,40,80,160,320,640]
    # NOTE(review): non-raw string with '\.' — emits an invalid-escape
    # warning on newer Pythons; consider a raw string literal.
    reg = re.compile("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
    print("profiling Python 3 regex engine\n")
    email_list_py, python_engine_list = profile_email_regex(reg, iteration_list, df_html)
    print("profiling re2 regex engine\n")
    reg = cffi_re2.compile("([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
    email_list_re2, re2_engine_list = profile_email_regex(reg, iteration_list, df_html)
    # Persist extracted e-mail lists and the timing profile as CSV
    df_emails_re2 = pd.DataFrame(email_list_re2)
    df_emails_re2.to_csv("/home/ubuntu/server_files/emails_re2.csv")
    df_emails_py = pd.DataFrame(email_list_py)
    df_emails_py.to_csv("/home/ubuntu/server_files/emails_py.csv")
    df_profile = pd.DataFrame({"iteration_no":iteration_list,
                               "python_engine_time": python_engine_list,
                               "re2_engine_time": re2_engine_list})
    df_profile.to_csv("/home/ubuntu/server_files/profile.csv")
#!/usr/bin/env python3
"""Collect image and graphie asset references from translated PO files."""
import argparse
import os.path
import json

from check import readPOFiles

# Prefer the fast RE2 bindings but fall back to the stdlib engine, matching
# the sibling script's behaviour (the patterns below are re-compatible).
try:
    import cffi_re2 as re2
except ImportError:
    import re as re2
import simplejson as json  # NOTE: intentionally shadows the stdlib json import above

# Fix: the dots in 'amazonaws.com' were previously unescaped, so the pattern
# would also match hosts like 's3.amazonawsXcom'.
imageRegex = re2.compile(
    r"https?://ka-perseus-(images|graphie)\.s3\.amazonaws\.com/([a-z0-9]+)\.(jpeg|jpg|png)")
graphieRegex = re2.compile(
    r"web\+graphie://ka-perseus-graphie\.s3\.amazonaws\.com/([a-z0-9]+)")

images = set()   # collected image filenames, formatted "<hash>.<extension>"
graphie = set()  # collected graphie asset hashes


def findInPO(po):
    """Scan every entry of *po*, adding asset references found in either the
    English source (msgid) or the translation (msgstr) to the module-level
    `images` / `graphie` sets."""
    for entry in po:
        engl = entry.msgid
        trans = entry.msgstr
        for hit in imageRegex.findall(engl) + imageRegex.findall(trans):
            # findall yields group tuples; groups 2 and 3 are hash and extension
            images.add("{}.{}".format(hit[1], hit[2]))
        for hit in graphieRegex.findall(engl) + graphieRegex.findall(trans):
            graphie.add(hit)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--language', default="de",
                        help='The language to use')
    args = parser.parse_args()
    po = readPOFiles(os.path.join("cache", args.language))
def get_end_invariant_regex():
    """Return a regex capturing translation-invariant trailing content.

    Intended to be applied to the REVERSED string — hence the mirrored
    tokens (']]...[[', and 'n' followed by a backslash for newlines).
    """
    # Apply to reversed string
    invariant = r"^((n\\|[\.\?,!\s]+|\]\]\d*\s*[a-z-]+\s+☃\s*\[\[)*)\s*"
    return re.compile(invariant, re.UNICODE)
def test_match_chinese():
    """search() works with CJK patterns (module-level pytest variant)."""
    regex = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')
    # Digits between the key characters are permitted by [^一-龥]*
    assert regex.search('梦1幻2西3游')
    # CJK filler characters break the negated classes
    assert not regex.search('梦倩女幻幽魂西2游')
def get_text_content_regex():
    """Return a regex splitting a LaTeX text tag into (opener, content, closer) groups."""
    opener = r"(\\text\s*\{\s*)"
    content = r"([^\}]+?)"
    closer = r"(\s*\})"
    return re.compile(opener + content + closer)
def test_sub_basic():
    """sub() deletes every run of 'b' (module-level pytest variant)."""
    regex = cffi_re2.compile('b+')
    result = regex.sub('', 'abbcbbd')
    assert result == 'acd'
def get_image_regex():
    """Return a regex matching Khan Academy image/graphie asset URLs,
    optionally wrapped in markdown image syntax '![alt](...)'. """
    url_pattern = (
        r"((!\[([^\]]+)?\]\()?\s*(http|https|web\+graphie):\/\/"
        r"(ka-perseus-(images|graphie)\.s3\.amazonaws\.com|"
        r"fastly\.kastatic\.org\/ka-perseus-graphie)"
        r"\/[0-9a-f]+(\.(svg|png|jpg))?\)?)")
    return re.compile(url_pattern)
def test_sub_chinese():
    """sub() with a multi-byte pattern (module-level pytest variant)."""
    regex = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')
    result = regex.sub('倩女', '梦幻西游好玩吗?')
    assert result == '倩女好玩吗?'
def test_basic_search(self):
    """search() finds a match anywhere in the subject, not just at position 0."""
    assert_is_not_none(cffi_re2.compile(r'b+').search('abbcd'))
def test_invalid_regex():
    """Compile a lookahead pattern, a construct RE2 rejects (pytest variant)."""
    pattern = '(?!=.*[没不])'
    robj = cffi_re2.compile(pattern)
#!/usr/bin/env python3
"""Collect image and graphie asset references from translated PO files."""
import argparse
import os.path
import json

from check import readPOFiles

# Prefer the fast RE2 bindings; fall back to the re-compatible stdlib engine
try:
    import cffi_re2 as re2
except ImportError:
    import re as re2
import simplejson as json  # NOTE: intentionally shadows the stdlib json import above

# Fix: the dots in 'amazonaws.com' were previously unescaped, so the pattern
# would also match hosts like 's3.amazonawsXcom'.
imageRegex = re2.compile(
    r"https?://ka-perseus-(images|graphie)\.s3\.amazonaws\.com/([a-z0-9]+)\.(jpeg|jpg|png)"
)
graphieRegex = re2.compile(
    r"web\+graphie://ka-perseus-graphie\.s3\.amazonaws\.com/([a-z0-9]+)")

images = set()   # collected image filenames, formatted "<hash>.<extension>"
graphie = set()  # collected graphie asset hashes


def findInPO(po):
    """Scan every entry of *po*, adding asset references found in either the
    English source (msgid) or the translation (msgstr) to the module-level
    `images` / `graphie` sets."""
    for entry in po:
        engl = entry.msgid
        trans = entry.msgstr
        for hit in imageRegex.findall(engl) + imageRegex.findall(trans):
            # findall yields group tuples; groups 2 and 3 are hash and extension
            images.add("{}.{}".format(hit[1], hit[2]))
        for hit in graphieRegex.findall(engl) + graphieRegex.findall(trans):
            graphie.add(hit)
def test_match_basic():
    """search() returns a truthy match object on success."""
    regex = cffi_re2.compile('b+')
    assert regex.search('abbcd')
def __init__(self):
    """Initialize empty indexing state."""
    # Occurrence counter, presumably keyed by normalized English string
    self.index = Counter()
    # Mapping of indexed strings to their translations — TODO confirm schema
    self.translated_index = {}
    # Rule-based autotranslator used while indexing
    self.autotranslator = RuleAutotranslator()
    # Matches any single decimal digit
    self._re = re.compile(r"\d")
def test_invalid_regex_2(self):
    """Compile a lookbehind pattern — a construct RE2 does not support."""
    pattern = '(?<![没不])'
    robj = cffi_re2.compile(pattern)
def read_texttag_index(lang):
    """Load the text-tag pattern index for *lang* as an english->translated map.

    Returns an empty dict when no pattern file exists for the language.
    NOTE(review): `read_patterns` is defined elsewhere — schema assumed to be
    a list of {"english": ..., "translated": ...} dicts; confirm at caller.
    """
    try:
        texttags = read_patterns(lang, "texttags")
        return {
            v["english"]: v["translated"]
            for v in texttags
            # Ignore empty string == untranslated
            # (but keep the special empty->empty entry)
            if (v["translated"] or (
                v["english"] == "" and v["translated"] == ""))
        }
    except FileNotFoundError:
        return {}


# NOTE(review): mid-file import — consider moving to the top of the file.
import re

# Matches strings that are purely numeric: an integer or a simple decimal
_numeric_only_re = re.compile(r"^\d+(\.\d+)?$", re.UNICODE)


def is_numeric_only(s):
    """Return True iff *s* is a plain number like '12' or '3.14'; None -> False."""
    if s is None:
        return False
    return _numeric_only_re.match(s) is not None


def pattern_list_to_xliff(patterns):
    """
    Convert a JSON list to a XLIFF soup
    """
    # Read template XLIFF
    with open("template.xliff") as infile:
        soup = BeautifulSoup(infile, "lxml-xml")
    # NOTE(review): this function appears truncated in this chunk — the rest
    # of its body lies outside the visible region.
def test_basic_groups(self):
    """search().groups() yields the single captured group as a 1-tuple."""
    regex = cffi_re2.compile(r'a(b+)')
    match = regex.search("abbc")
    assert_is_not_none(match)
    assert_equal(match.groups(), ("bb", ))
def get_text_regex():
    """Return a regex matching LaTeX text commands (\\text, \\mathrm,
    \\textit, \\textbf) whose argument is NOT a known unit/function
    abbreviation such as 'cm' or 'min'.

    Fix: the original exception list contained "cm" twice; the duplicate
    entry has been removed (the generated pattern's behavior is unchanged).
    """
    exceptions = ["cm", "m", "g", "kg", "s", "min", "max", "h"]
    # One negative lookahead per exception, each tolerating one leading space
    exc_clause = "".join(r"(?! ?" + ex + r"\})" for ex in exceptions)
    regex = r"(\\(text|mathrm|textit|textbf)\s*\{" + exc_clause + r")"
    return re.compile(regex)
def test_invalid_regex_2():
    """Compile a lookbehind pattern, which RE2 rejects (pytest variant)."""
    pattern = '(?<![没不])'
    robj = cffi_re2.compile(pattern)