Example #1
 def test_flag_ignorecase(self):
     rgx_ci = cffi_re2.compile(r'a(b+)$', flags=cffi_re2.IGNORECASE)
     rgx_cs = cffi_re2.compile(r'a(b+)$')
     # Check case sensitive
     assert_is_none(rgx_cs.match("AB"))
     assert_is_none(rgx_cs.match("Ab"))
     assert_is_none(rgx_cs.match("aB"))
     assert_is_none(rgx_cs.match("aBb"))
     assert_is_none(rgx_cs.match("abB"))
     assert_is_not_none(rgx_cs.match("ab"))
     assert_is_not_none(rgx_cs.match("abb"))
     # Check case insensitive
     assert_is_not_none(rgx_ci.match("AB"))
     assert_is_not_none(rgx_ci.match("Ab"))
     assert_is_not_none(rgx_ci.match("aB"))
     assert_is_not_none(rgx_ci.match("aBb"))
     assert_is_not_none(rgx_ci.match("abB"))
     assert_is_not_none(rgx_ci.match("ab"))
     assert_is_not_none(rgx_ci.match("abb"))
     # Official example
     assert_equal(
         cffi_re2.sub(r'\sAND\s',
                      ' & ',
                      'Baked Beans And Spam',
                      flags=cffi_re2.IGNORECASE), 'Baked Beans & Spam')
Example #2
 def test_basic_match(self):
     # A pattern that only occurs mid-string is found by search(), not by the start-anchored match()
     robj = cffi_re2.compile(r'b+')
     assert_is_none(robj.match('abbcd'))
     # [abc]+$ must reach the end of the string, but 'abbcd' ends in 'd', so match() fails
     robj = cffi_re2.compile(r'[abc]+$')
     assert_is_none(robj.match('abbcd'))
     # Full match regex should match
     robj = cffi_re2.compile(r'[abcd]+')
     assert_is_not_none(robj.match('abbcd'))
     # Regex match should be left-anchored, not both-anchored
     robj = cffi_re2.compile(r'a+')
     assert_is_not_none(robj.match('aaab'))
     assert_is_none(robj.match('baaab'))
Example #4
 def test_medium_complexity(self):
     """Check medium complexity regexes"""
     # Examples from github.com/ulikoehler/KATranslationCheck
     # 1
     rgx = cffi_re2.compile(r"\b[Ii]nto\b")
     assert_is_not_none(rgx.search("Into the darkness"))
     assert_is_not_none(rgx.search("I went into the darkness"))
     assert_is_none(rgx.search("abcde beintoaqe aqet"))
     # 2
     rgx = cffi_re2.compile(r"\d+\$\s*dollars?")
     assert_is_not_none(rgx.search("12$ dollars"))
     assert_is_not_none(rgx.match("12$ dollars"))
     assert_is_not_none(rgx.match("1$ dollar"))
     assert_is_not_none(rgx.match("1$  dollar"))
     assert_is_not_none(rgx.match("1$  dollars"))
Example #6
def using_cffi_re2(text):
    '''
    Extract e-mail local parts (followed by '@' or ' at ') using cffi_re2
     - https://github.com/vls/cffi_re2
    '''
    pattern = cffi_re2.compile(
        (r"([a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`"
         r"{|}~-]+)*(@|\sat\s))"))
    return pattern.findall(text)
Example #7
 def compile(self, rgx, flags=0):
     rgx = "({0})".format(rgx)
     self.numRegex += 1
     try:
         return cffi_re2.compile(rgx, flags)
     except ValueError:
         # Enable this for debugging
         # print("Regex in compatibility mode: {0}".format(rgx))
         self.numCompatRegex += 1
         return re.compile(rgx, flags)
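A minimal, self-contained sketch of the same fallback idea. The function name compile_with_fallback and the sample patterns are illustrative only, and the sketch assumes (as the except clause above does) that cffi_re2.compile raises ValueError for syntax RE2 cannot handle, such as lookbehind:

import re

import cffi_re2


def compile_with_fallback(rgx, flags=0):
    # Prefer the RE2 engine; fall back to the stdlib engine for
    # patterns RE2 rejects (e.g. lookahead/lookbehind assertions).
    try:
        return cffi_re2.compile(rgx, flags)
    except ValueError:
        return re.compile(rgx, flags)


fast = compile_with_fallback(r"\d+\s*dollars?")   # expected to compile via cffi_re2
compat = compile_with_fallback(r"(?<=\d)px")      # lookbehind: expected to fall back to re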
Example #9
def contains(v, regex):
    """Removes meta data from regex then checks for a regex match
    """
    if six.PY3 and isinstance(v, bytes):
        v = v.decode()
    try:
        return cffi_re2.compile(regex.split('\\;')[0],
                                flags=cffi_re2.IGNORECASE).search(v)
    except Exception as E:
        print(str(E))
        print(regex)
        #print(str(v))
        return re.compile(regex.split('\\;')[0], flags=re.IGNORECASE).search(v)
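A hedged usage sketch for contains() above. The value and pattern are made up; the '\;' suffix stands in for whatever metadata the caller appends after the pattern, and only the part before it is compiled:

# Only 'hello world' is compiled; 'note=example' is treated as metadata.
if contains(b"Hello World from RE2", r"hello world\;note=example"):
    print("matched")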
Example #10
 def test_flag_ignorecase(self):
     rgx_ci = cffi_re2.compile(r'a(b+)$', flags=cffi_re2.IGNORECASE)
     rgx_cs = cffi_re2.compile(r'a(b+)$')
     # Check case sensitive
     assert_is_none(rgx_cs.match("AB"))
     assert_is_none(rgx_cs.match("Ab"))
     assert_is_none(rgx_cs.match("aB"))
     assert_is_none(rgx_cs.match("aBb"))
     assert_is_none(rgx_cs.match("abB"))
     assert_is_not_none(rgx_cs.match("ab"))
     assert_is_not_none(rgx_cs.match("abb"))
     # Check case insensitive
     assert_is_not_none(rgx_ci.match("AB"))
     assert_is_not_none(rgx_ci.match("Ab"))
     assert_is_not_none(rgx_ci.match("aB"))
     assert_is_not_none(rgx_ci.match("aBb"))
     assert_is_not_none(rgx_ci.match("abB"))
     assert_is_not_none(rgx_ci.match("ab"))
     assert_is_not_none(rgx_ci.match("abb"))
     # Official example
     assert_equal(cffi_re2.sub(r'\sAND\s', ' & ', 'Baked Beans And Spam', flags=cffi_re2.IGNORECASE),
                  'Baked Beans & Spam')
Example #11
    def __init__(self, lang):
        self.lang = lang
        self.autotrans = RuleAutotranslator()
        # Preindex filter
        # Used to avoid indexing patterns with one instance
        self.preindex_ctr = Counter()  # norm engl hash => count
        self.preindex_min_count = 2  # minimum instances to be considered a pattern
        self.preindex_set = set()  # Compiled from preindex_ctr in clean_preindex()

        self.index = Counter()  # norm engl => count
        self.untranslated_index = Counter()  # norm engl => count
        self.translated_index = defaultdict(Counter)  # norm engl => translation => count
        self.filename_index = defaultdict(Counter)  # norm_engl => {filename: count}
        self._formula_re = re.compile(r"\$[^\$]+\$")
        self._img_re = get_image_regex()
        self._text = get_text_content_regex()
        self._transURLs = {}  # Translation URL examples
        # NOTE: Need to run indexer TWO TIMES to get accurate results
        # as the text tags first need to be updated to get an accurate IF index
        self.texttags = read_texttag_index(lang)
Example #12
 def test_invalid_regex(self):
     # RE2 intentionally does not support lookahead assertions such as (?!...)
     p = '(?!=.*[没不])'
     robj = cffi_re2.compile(p)
Example #13
    def test_match_chinese(self):
        robj = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')

        assert_true(robj.search('梦1幻2西3游'))
        assert_false(robj.search('梦倩女幻幽魂西2游'))
Example #14
def get_input_re():
    return re.compile(r"\[\[☃\s+[a-z-]+\s*\d*\]\]")
Example #15
def get_formula_re():
    return re.compile(r"\$[^\$]+\$")
Example #16
def get_start_invariant_regex():
    return re.compile(r"^((>|\s+|-|\\n)*)\s*", re.UNICODE)
Example #17
 def test_basic_findall(self):
     robj = cffi_re2.compile(r'a(b+)')
     mo = robj.findall("abbcdefabbbbca")
     assert_is_not_none(mo)
     assert_equal(mo, ["bb", "bbbb"])
Example #18
 def test_basic_groups(self):
     robj = cffi_re2.compile(r'a(b+)')
     mo = robj.search("abbc")
     assert_is_not_none(mo)
     assert_equal(mo.groups(), ("bb",))
Example #19
 def test_sub_basic(self):
     robj = cffi_re2.compile(r'b+')
     assert_equal(robj.sub('', 'abbcbbd'), 'acd')
Example #20
 def test_sub_chinese(self):
     robj = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')
     assert_equal(robj.sub('倩女', '梦幻西游好玩吗?'), u'倩女好玩吗?')
Example #22
import re
import time

import cffi_re2
import pandas as pd


def profile_email_regex(reg, iteration_list, df_html):
    # Time the e-mail extraction (extract_emails, defined elsewhere in the
    # project) for each iteration count and record the elapsed seconds.
    python_engine_list = []
    for iteration in iteration_list:
        start_time = time.time()
        for i in range(iteration):
            email_list = extract_emails(df_html["html"], df_html["url"], reg)
        
        end_time = time.time()
        total_time = end_time-start_time
        python_engine_list.append(total_time)
        print("total time (in seconds) for " + str(iteration) + " is ", end_time-start_time)
    return email_list, python_engine_list


if __name__ == "__main__":  # script entry point

    df_html = pd.read_csv("/home/ubuntu/server_files/us_fda_raw_html.csv")

    iteration_list = [10,20,40,80,160,320,640]
    reg = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
    print("profiling Python 3 regex engine\n")
    email_list_py, python_engine_list = profile_email_regex(reg, iteration_list, df_html)
    
    print("profiling re2 regex engine\n")
    reg = cffi_re2.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)")
    email_list_re2, re2_engine_list = profile_email_regex(reg, iteration_list, df_html)
    
    df_emails_re2 = pd.DataFrame(email_list_re2)
    df_emails_re2.to_csv("/home/ubuntu/server_files/emails_re2.csv")
    
    df_emails_py = pd.DataFrame(email_list_py)
    df_emails_py.to_csv("/home/ubuntu/server_files/emails_py.csv")
    
    df_profile = pd.DataFrame({"iteration_no":iteration_list, "python_engine_time": python_engine_list, "re2_engine_time": re2_engine_list})
    df_profile.to_csv("/home/ubuntu/server_files/profile.csv")
Example #25
#!/usr/bin/env python3
import argparse
import os.path
import json
from check import readPOFiles
import cffi_re2 as re2
import simplejson as json

imageRegex = re2.compile(r"https?://ka-perseus-(images|graphie)\.s3\.amazonaws.com/([a-z0-9]+)\.(jpeg|jpg|png)")
graphieRegex = re2.compile(r"web\+graphie://ka-perseus-graphie\.s3\.amazonaws.com/([a-z0-9]+)")

images = set()
graphie = set()

def findInPO(po):
    for entry in po:
        engl = entry.msgid
        trans = entry.msgstr

        for hit in imageRegex.findall(engl) + imageRegex.findall(trans):
            images.add("{}.{}".format(hit[1], hit[2]))

        for hit in graphieRegex.findall(engl) + graphieRegex.findall(trans):
            graphie.add(hit)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--language', default="de", help='The language to use')
    args = parser.parse_args()

    po = readPOFiles(os.path.join("cache", args.language))
Example #26
def get_end_invariant_regex():
    # Apply to reversed string
    return re.compile(r"^((n\\|[\.\?,!\s]+|\]\]\d*\s*[a-z-]+\s+☃\s*\[\[)*)\s*",
                      re.UNICODE)
Example #27
def test_match_chinese():
    robj = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')

    assert robj.search('梦1幻2西3游')
    assert not robj.search('梦倩女幻幽魂西2游')
Example #28
def get_text_content_regex():
    return re.compile(r"(\\text\s*\{\s*)([^\}]+?)(\s*\})")
Example #29
def test_sub_basic():
    robj = cffi_re2.compile('b+')

    assert robj.sub('', 'abbcbbd') == 'acd'
Example #30
def get_image_regex():
    return re.compile(
        r"((!\[([^\]]+)?\]\()?\s*(http|https|web\+graphie):\/\/(ka-perseus-(images|graphie)\.s3\.amazonaws\.com|fastly\.kastatic\.org\/ka-perseus-graphie)\/[0-9a-f]+(\.(svg|png|jpg))?\)?)"
    )
Example #31
def test_sub_chinese():
    robj = cffi_re2.compile('梦[^一-龥]*幻[^一-龥]*西[^一-龥]*游')

    assert robj.sub('倩女', '梦幻西游好玩吗?') == '倩女好玩吗?'
Example #32
 def test_basic_search(self):
     robj = cffi_re2.compile(r'b+')
     assert_is_not_none(robj.search('abbcd'))
Example #33
def test_invalid_regex():
    p = '(?!=.*[没不])'
    robj = cffi_re2.compile(p)
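RE2 deliberately omits lookahead and lookbehind, so patterns like the ones in the "invalid regex" tests are rejected at compile time. Below is an illustrative pytest-style sketch (the test name is made up), assuming, as the except ValueError in the compile wrapper of Example #7 suggests, that cffi_re2 signals the rejection with ValueError:

import pytest

import cffi_re2


def test_lookaround_is_rejected():
    # RE2 supports neither lookahead nor lookbehind assertions.
    with pytest.raises(ValueError):
        cffi_re2.compile('(?!=.*[没不])')
    with pytest.raises(ValueError):
        cffi_re2.compile('(?<![没不])')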
Example #34
#!/usr/bin/env python3
import argparse
import os.path
import json
from check import readPOFiles
try:
    import cffi_re2 as re2
except ImportError:
    import re as re2
import simplejson as json

imageRegex = re2.compile(
    r"https?://ka-perseus-(images|graphie)\.s3\.amazonaws.com/([a-z0-9]+)\.(jpeg|jpg|png)"
)
graphieRegex = re2.compile(
    r"web\+graphie://ka-perseus-graphie\.s3\.amazonaws.com/([a-z0-9]+)")

images = set()
graphie = set()


def findInPO(po):
    for entry in po:
        engl = entry.msgid
        trans = entry.msgstr

        for hit in imageRegex.findall(engl) + imageRegex.findall(trans):
            images.add("{}.{}".format(hit[1], hit[2]))

        for hit in graphieRegex.findall(engl) + graphieRegex.findall(trans):
            graphie.add(hit)
Example #35
def test_match_basic():
    robj = cffi_re2.compile('b+')
    flag = robj.search('abbcd')
    assert flag
Example #37
 def __init__(self):
     self.index = Counter()
     self.translated_index = {}
     self.autotranslator = RuleAutotranslator()
     self._re = re.compile(r"\d")
Example #38
 def test_invalid_regex_2(self):
     p = '(?<![没不])'
     robj = cffi_re2.compile(p)
Example #39
def read_texttag_index(lang):
    try:
        texttags = read_patterns(lang, "texttags")
        return {
            v["english"]: v["translated"]
            for v in texttags
            # Ignore empty string == untranslated
            if (v["translated"] or (
                v["english"] == "" and v["translated"] == ""))
        }
    except FileNotFoundError:
        return {}


import re
_numeric_only_re = re.compile(r"^\d+(\.\d+)?$", re.UNICODE)


def is_numeric_only(s):
    if s is None:
        return False
    return _numeric_only_re.match(s) is not None


def pattern_list_to_xliff(patterns):
    """
    Convert a JSON list to a XLIFF soup
    """
    # Read template XLIFF
    with open("template.xliff") as infile:
        soup = BeautifulSoup(infile, "lxml-xml")
Example #40
 def test_basic_groups(self):
     robj = cffi_re2.compile(r'a(b+)')
     mo = robj.search("abbc")
     assert_is_not_none(mo)
     assert_equal(mo.groups(), ("bb", ))
Example #41
def get_text_regex():
    exceptions = ["cm", "m", "g", "kg", "s", "min", "max", "h", "cm"]
    exc_clause = "".join([r"(?! ?" + ex + r"\})" for ex in exceptions])
    regex = r"(\\(text|mathrm|textit|textbf)\s*\{" + exc_clause + r")"
    return re.compile(regex)
Example #42
def test_invalid_regex_2():
    p = '(?<![没不])'
    robj = cffi_re2.compile(p)