import filecmp
import time

import pytest
from lxml import etree as ET  # ET.tostring(..., pretty_print=True) below requires lxml

from emparser.preprocess import Lexer
# NOTE: this file does not show where common, Parser, and CSTHandler live;
# the import paths below are assumptions and may need adjusting to the repo layout.
from emparser import common
from emparser.parser import Parser
from emparser.cst_handler import CSTHandler


class TestParser:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.parser = Parser()
        self.lexer = Lexer()
        self.lexer.load_symbol_dict(common.MML_VCT)
        self.lexer.build_len2symbol()
        yield

    def test_parse_theorem_1(self):
        case = "theorem ( ( for r , s , t holds ( r __O_* s ) __O_* t = r __O_* ( s __O_* t ) ) \n" + \
               "& ex t st for s1 holds s1 __O_* t = s1 & t __O_* s1 = s1 & ex s2 st s1 __O_* s2 \n" + \
               "= t & s2 __O_* s1 = t ) implies S is __M_Group ;"
        xmlstr = self.parser.parse_theorem(case)
        # xml_root = self.parser.parse_theorem(case)
        # xmlstr = util.pretty_xml(xml_root)
        output_path = common.OUTPUT_DIR + '/theorem1.xml'
        # output_path = common.EXPECT_DIR + '/main/theorem1.xml'
        with open(output_path, 'w') as file:
            file.write(xmlstr)
        expect_path = common.EXPECT_DIR + '/main/theorem1.xml'
        assert filecmp.cmp(expect_path, output_path, shallow=False)
class TestMizarErrorListener:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.lexer = Lexer()
        self.lexer.load_symbol_dict(common.MML_VCT)
        self.lexer.build_len2symbol()
        self.parser = Parser()
        yield
class TestPerformance:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.lexer = Lexer()
        self.parser = Parser()
        self.handler = CSTHandler()
        yield

    @pytest.mark.slow
    def test_performance_1(self):
        """
        current performance test result:
        #1 : 3.743171691894531e-05
        #2 : 0.0007483959197998047
        #3 : 0.0007982254028320312
        #4 : 0.006863117218017578
        #5 : 0.010075092315673828
        #6 : 0.16891860961914062
        #7 : 0.41028857231140137
        #8 : 0.0686800479888916
        #9 : 0.015757322311401367
        #10 : 0.001768350601196289
        """
        print('#test_performance_1#')
        t0 = time.time()
        self.lexer.clear()
        self.lexer.build_len2symbol()
        t1 = time.time()
        print(f'#1 : {t1 - t0}')

        mizpath = common.DATA_DIR + '/ring_1.miz'
        with open(mizpath, 'r') as f:
            lines = f.readlines()
        t2 = time.time()
        print(f'#2 : {t2 - t1}')

        env_lines, tp_lines = self.lexer.separate_env_and_text_proper(lines)
        env_lines = self.lexer.remove_comment(env_lines)
        tp_lines = self.lexer.remove_comment(tp_lines)
        t3 = time.time()
        print(f'#3 : {t3 - t2}')

        env_tokens, env_posmap = self.lexer.lex(env_lines, is_environment_part=True)
        # env_xmlroot = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlstr = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlroot = ET.fromstring(env_xmlstr)
        vocfiles = self.handler.extract_vocabularies(env_xmlroot)
        # print(vocfiles)
        t4 = time.time()
        print(f'#4 : {t4 - t3}')

        self.lexer.load_symbol_dict(common.MML_VCT, vocfiles)
        self.lexer.build_len2symbol()
        t5 = time.time()
        print(f'#5 : {t5 - t4}')
        # print(self.lexer.symbol_dict)
        # print(self.lexer.len2symbol)

        tp_tokens, tp_posmap = self.lexer.lex(tp_lines, first_line_number=len(env_lines)+1)
        t6 = time.time()
        print(f'#6 : {t6 - t5}')

        # tp_xmlroot = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlstr = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlroot = ET.fromstring(tp_xmlstr)
        t7 = time.time()
        print(f'#7 : {t7 - t6}')

        self.handler.adjust_type_expression(tp_xmlroot)
        self.handler.adjust_term_expression(tp_xmlroot)
        self.handler.remove_prefix(tp_xmlroot)
        t8 = time.time()
        print(f'#8 : {t8 - t7}')

        # tp_xmlstr = util.pretty_xml(tp_xmlroot)
        tp_xmlstr = ET.tostring(tp_xmlroot, pretty_print=True).decode('utf-8')
        t9 = time.time()
        print(f'#9 : {t9 - t8}')

        output_path = common.OUTPUT_DIR + '/performance_1.xml'
        # output_path = common.EXPECT_DIR + '/performance/performance_1.xml'
        with open(output_path, 'w') as file:
            file.write(tp_xmlstr)
        t10 = time.time()
        print(f'#10 : {t10 - t9}')

        expect_path = common.EXPECT_DIR + '/performance/performance_1.xml'
        assert filecmp.cmp(expect_path, output_path, shallow=False)

    @pytest.mark.slow
    def test_performance_2(self):
        """
        current performance test result:
        #1 : 0.00012969970703125
        #2 : 0.011978864669799805
        #3 : 0.008319616317749023
        #4 : 0.006693124771118164
        #5 : 0.005953550338745117
        #6 : 2.4441311359405518
        #7 : 14.36815881729126
        #8 : 1.1940639019012451
        #9 : 0.3822813034057617
        #10 : 0.08751440048217773
        """
        print('#test_performance_2#')
        t0 = time.time()
        self.lexer.clear()
        self.lexer.build_len2symbol()
        t1 = time.time()
        print(f'#1 : {t1 - t0}')

        mizpath = common.DATA_DIR + '/jgraph_4.miz'
        with open(mizpath, 'r') as f:
            lines = f.readlines()
        t2 = time.time()
        print(f'#2 : {t2 - t1}')

        env_lines, tp_lines = self.lexer.separate_env_and_text_proper(lines)
        env_lines = self.lexer.remove_comment(env_lines)
        tp_lines = self.lexer.remove_comment(tp_lines)
        t3 = time.time()
        print(f'#3 : {t3 - t2}')

        env_tokens, env_posmap = self.lexer.lex(env_lines, is_environment_part=True)
        # env_xmlroot = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlstr = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlroot = ET.fromstring(env_xmlstr)
        vocfiles = self.handler.extract_vocabularies(env_xmlroot)
        # print(vocfiles)
        t4 = time.time()
        print(f'#4 : {t4 - t3}')

        self.lexer.load_symbol_dict(common.MML_VCT, vocfiles)
        self.lexer.build_len2symbol()
        t5 = time.time()
        print(f'#5 : {t5 - t4}')
        # print(self.lexer.symbol_dict)
        # print(self.lexer.len2symbol)

        tp_tokens, tp_posmap = self.lexer.lex(tp_lines, first_line_number=len(env_lines)+1)
        t6 = time.time()
        print(f'#6 : {t6 - t5}')

        # tp_xmlroot = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlstr = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlroot = ET.fromstring(tp_xmlstr)
        t7 = time.time()
        print(f'#7 : {t7 - t6}')

        self.handler.adjust_type_expression(tp_xmlroot)
        self.handler.adjust_term_expression(tp_xmlroot)
        self.handler.remove_prefix(tp_xmlroot)
        t8 = time.time()
        print(f'#8 : {t8 - t7}')

        # tp_xmlstr = util.pretty_xml(tp_xmlroot)
        tp_xmlstr = ET.tostring(tp_xmlroot, pretty_print=True).decode('utf-8')
        t9 = time.time()
        print(f'#9 : {t9 - t8}')

        output_path = common.OUTPUT_DIR + '/performance_2.xml'
        # output_path = common.EXPECT_DIR + '/performance/performance_2.xml'
        with open(output_path, 'w') as file:
            file.write(tp_xmlstr)
        t10 = time.time()
        print(f'#10 : {t10 - t9}')

        expect_path = common.EXPECT_DIR + '/performance/performance_2.xml'
        assert filecmp.cmp(expect_path, output_path, shallow=False)
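# The two tests above exercise the full lexer -> parser -> CSTHandler pipeline
# step by step. The helper below is a minimal sketch of that same pipeline as
# one function, reusing only calls that already appear in the tests; the
# function name parse_miz_file itself is hypothetical, not part of emparser.
def parse_miz_file(mizpath):
    lexer = Lexer()
    parser = Parser()
    handler = CSTHandler()
    lexer.clear()
    lexer.build_len2symbol()

    with open(mizpath, 'r') as f:
        lines = f.readlines()

    # Split the article into environment and text-proper parts, drop comments.
    env_lines, tp_lines = lexer.separate_env_and_text_proper(lines)
    env_lines = lexer.remove_comment(env_lines)
    tp_lines = lexer.remove_comment(tp_lines)

    # Parse the environment first to find which vocabulary files the article
    # imports, then reload the symbol dictionary restricted to those files.
    env_tokens, env_posmap = lexer.lex(env_lines, is_environment_part=True)
    env_xmlroot = ET.fromstring(
        parser.parse_environment('\n'.join(env_tokens), env_posmap))
    vocfiles = handler.extract_vocabularies(env_xmlroot)
    lexer.load_symbol_dict(common.MML_VCT, vocfiles)
    lexer.build_len2symbol()

    # Lex and parse the text proper, then normalize the resulting CST.
    tp_tokens, tp_posmap = lexer.lex(tp_lines, first_line_number=len(env_lines) + 1)
    tp_xmlroot = ET.fromstring(
        parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap))
    handler.adjust_type_expression(tp_xmlroot)
    handler.adjust_term_expression(tp_xmlroot)
    handler.remove_prefix(tp_xmlroot)
    return ET.tostring(tp_xmlroot, pretty_print=True).decode('utf-8')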
class TestLexer:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.lexer = Lexer()
        self.lexer.load_symbol_dict(common.MML_VCT)
        self.lexer.build_len2symbol()

    def test_separate_env_and_text_proper(self):
        filepath = common.DATA_DIR + '/ring_1.miz'
        with open(filepath, 'r') as f:
            lines = f.readlines()
        env_lines, tp_lines = self.lexer.separate_env_and_text_proper(lines)
        assert len(env_lines) == 40
        assert len(tp_lines) == 1234 - 40

    def test_load_symbol_dict(self):
        # 1. load symbols in the specified Mizar files
        self.lexer.load_symbol_dict(common.MML_VCT, ["AFF_1", "AFF_2", "AFVECT0"])
        # HIDDEN -> 4, AFF_1 -> 1, AFF_2 -> 14, AFVECT0 -> 10
        assert len(self.lexer.symbol_dict) == 4 + 1 + 14 + 10

        # 2. load all symbols in MML
        # pprint.pprint(self.lexer.symbol_dict)
        self.lexer.load_symbol_dict(common.MML_VCT)
        assert len(self.lexer.symbol_dict) == 9214
        assert self.lexer.symbol_dict['zeros'] == {
            'filename': 'PRVECT_1',
            'type': 'O'
        }

    def test_build_len2symbol(self):
        # pprint.pprint(self.lexer.len2symbol)
        assert len(self.lexer.len2symbol) == 45

    def test_read_until_space(self):
        cases = [
            ["abc def ghi", "abc"],
            ["abc__8()\nfaa ghi", "abc__8()"],
            ["abc__8()faaghi", "abc__8()faaghi"],
        ]
        for case in cases:
            res = self.lexer.read_until_space(case[0])
            assert res == case[1]

    def test_read_identifier(self):
        cases = [
            ["abc def ghi", "abc"],
            ["abC_d2Ef3 ghi", "abC_d2Ef3"],
            ["abC_d2(.Ef3 ghi", "abC_d2"],
            ["a'abC_d2_'(.Ef3 ghi", "a'abC_d2_'"],
            [" def ghi", ""],
        ]
        for case in cases:
            res = self.lexer.read_identifier(case[0])
            assert res == case[1]

    def test_is_word_boundary(self):
        cases = [
            [('a', 'b'), False],
            [('_', 'b'), True],
            [('a', '0'), False],
            [("'", 'b'), True],
            [('a', "'"), True],
            [('a', '('), True],
            [(')', 'b'), True],
            [(')', '('), True],
        ]
        for case in cases:
            res = self.lexer.is_word_boundary(*case[0])
            assert res == case[1]

    def test_cut_symbol(self):
        cases = [
            [".abc def ghi", ("__O100_.", "abc def ghi")],
            ["..abc def ghi", ("__O100_..", "abc def ghi")],
            ["||..abc def ghi", ("__K_||..", "abc def ghi")],
            ["abss def ghi", None],
            [",;:abc||def", (",", ";:abc||def")],
            [",||;:abcdef", (",", "||;:abcdef")],
            ["$1,abcdef", ("$1", ",abcdef")],
            ["...||abcdef", ("...", "||abcdef")],
            ["||abcdef", ('__O100_||', 'abcdef')],
            ["= a", ('=', ' a')],
            ["& sup I in I;", ('&', ' sup I in I;')],
        ]
        for case in cases:
            res = self.lexer.cut_symbol(case[0])
            assert res == case[1]

    def test_cut_reserved_word(self):
        cases = [
            ["qua;abc def", ("qua", ";abc def")],
            ["associativity\nsuppose", ("associativity", "\nsuppose")],
            ["abc def", None],
        ]
        for case in cases:
            res = self.lexer.cut_reserved_word(case[0])
            assert res == case[1]

    def test_cut_identifier(self):
        cases = [
            ["ABC;abc def", ("ABC", ";abc def")],
            ["ABC abc def", ("ABC", " abc def")],
            ["123 abc, def", None],
        ]
        for case in cases:
            res = self.lexer.cut_identifier(case[0])
            assert res == case[1]

    def test_cut_numeral(self):
        cases = [
            ["123;abc def", ("123", ";abc def")],
            ["456 abc def", ("456", " abc def")],
            ["1 abc def", ("1", " abc def")],
            ["0 abc def", ("0", " abc def")],
            ["012 abc def", None],
            ["ABC abc def", None],
        ]
        for case in cases:
            res = self.lexer.cut_numeral(case[0])
            assert res == case[1]

    def test_remove_comment_in_a_line(self):
        cases = [
            "theorem :: ABCMIZ_0:1",
            "holds ex_sup_of I, T & sup I in I; :: this is a comment",
            ":: everything is comment",
        ]
        expects = [
            "theorem ",
            "holds ex_sup_of I, T & sup I in I; ",
            "",
        ]
        for case, expect in zip(cases, expects):
            res = self.lexer.remove_comment_in_a_line(case)
            assert res == expect

    def test_remove_comment(self):
        case1 = [
            "theorem :: ABCMIZ_0:1",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        expect1 = [
            "theorem ",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        res1 = self.lexer.remove_comment(case1)
        assert expect1 == res1

        case2 = [
            "theorem :: ABCMIZ_0:1",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        expect2 = [
            "theorem ",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        res2 = self.lexer.remove_comment(case2)
        assert expect2 == res2

    def test_lex(self):
        case1 = [
            "theorem ",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        expect1 = [
            "theorem",
            "for T being __V_Noetherian __M_sup-Semilattice for I being __M_Ideal of T",
            "holds __R_ex_sup_of I , T & __O200_sup I __R_in I ;"
        ]
        text1, pos_map1 = self.lexer.lex(case1)
        assert expect1 == text1

        case2 = [
            "theorem",
            "((for r,s,t holds (r * s) * t = r * (s * t)) & ex t st for s1 holds s1",
            "* t = s1 & t * s1 = s1 & ex s2 st s1 * s2 = t & s2 * s1 = t) implies S is Group;"
        ]
        expect2 = [
            "theorem",
            "( ( for r , s , t holds ( r __O_* s ) __O_* t = r __O_* ( s __O_* t ) ) & ex t st for s1 holds s1",
            "__O_* t = s1 & t __O_* s1 = s1 & ex s2 st s1 __O_* s2 = t & s2 __O_* s1 = t ) implies S is __M_Group ;"
        ]
        text2, pos_map2 = self.lexer.lex(case2)
        assert expect2 == text2

        case3 = [
            "theorem",
            "for F be add-associative right_zeroed right_complementable",
            "right-distributive non empty doubleLoopStr, x,y,z being Element of F holds",
            "x*(y-z) = x*y - x*z;",
        ]
        expect3 = [
            'theorem',
            'for F be __V_add-associative __V_right_zeroed __V_right_complementable',
            '__V_right-distributive non __V_empty __G_doubleLoopStr , x , y , z being __M_Element of F holds',
            'x __O_* ( y __O32_- z ) = x __O_* y __O32_- x __O_* z ;',
        ]
        text3, pos_map3 = self.lexer.lex(case3)
        assert expect3 == text3

        case4 = [
            "theorem",
            "for V being add-associative right_zeroed right_complementable non",
            "empty addLoopStr, u,v,w being Element of V holds -(v+w)=-w-v & -(w+-v)=v-w & -",
            "(v-w)=w+-v & -(-v-w)=w+v & u-(w+v)=u-v-w;",
        ]
        expect4 = [
            'theorem',
            'for V being __V_add-associative __V_right_zeroed __V_right_complementable non',
            '__V_empty __G_addLoopStr , u , v , w being __M_Element of V holds __O32_- ( v __O32_+ w ) = __O32_- w __O32_- v & __O32_- ( w __O32_+ __O32_- v ) = v __O32_- w & __O32_-',
            '( v __O32_- w ) = w __O32_+ __O32_- v & __O32_- ( __O32_- v __O32_- w ) = w __O32_+ v & u __O32_- ( w __O32_+ v ) = u __O32_- v __O32_- w ;',
        ]
        text4, pos_map4 = self.lexer.lex(case4)
        assert expect4 == text4

        case5 = [
            "definition",
            " let K be non empty multMagma, S be Subset of K;",
            " attr S is quasi-prime means",
            "for a, b being Element of K st a*b in S holds a in S or b in S;",
            "end;"
        ]
        expect5 = [
            "definition",
            "let K be non __V_empty __G_multMagma , S be __M_Subset of K ;",
            "attr S is __V_quasi-prime means",
            "for a , b being __M_Element of K st a __O_* b __R_in S holds a __R_in S or b __R_in S ;",
            "end ;"
        ]
        text5, pos_map5 = self.lexer.lex(case5)
        assert expect5 == text5

        case6 = [
            "registration",
            "let K be non empty multLoopStr;",
            "cluster prime -> proper quasi-prime for Subset of K;",
            "cluster proper quasi-prime -> prime for Subset of K;",
            "end;",
        ]
        expect6 = [
            "registration",
            "let K be non __V_empty __G_multLoopStr ;",
            "cluster __V_prime -> __V_proper __V_quasi-prime for __M_Subset of K ;",
            "cluster __V_proper __V_quasi-prime -> __V_prime for __M_Subset of K ;",
            "end ;",
        ]
        text6, pos_map6 = self.lexer.lex(case6)
        assert expect6 == text6

        case7 = [
            "notation",
            "let R be Ring, I be Ideal of R;",
            "synonym R/I for QuotientRing(R,I);",
            "end;",
        ]
        expect7 = [
            "notation",
            "let R be __M_Ring , I be __M_Ideal of R ;",
            "synonym R __O_/ I for __O_QuotientRing ( R , I ) ;",
            "end ;",
        ]
        text7, pos_map7 = self.lexer.lex(case7)
        assert expect7 == text7

        case8 = [
            "scheme",
            "NatInd { P[Nat] } : for k being Nat holds P[k]",
            "provided",
            "P[0] and",
            "for k be Nat st P[k] holds P[k + 1];",
        ]
        expect8 = [
            "scheme",
            "NatInd { P [ __M_Nat ] } : for k being __M_Nat holds P [ k ]",
            "provided",
            "P [ __O_0 ] and",
            "for k be __M_Nat st P [ k ] holds P [ k __O32_+ 1 ] ;",
        ]
        text8, pos_map8 = self.lexer.lex(case8)
        assert expect8 == text8
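# A note on the expected outputs above: the lexer tags each vocabulary symbol
# with a "__<class>[<priority>]_" prefix before handing the token stream to
# the parser. Judging from the cases in TestLexer and the Mizar vocabulary
# (.vct) symbol classes, O marks a functor/operator, optionally with its
# priority (e.g. __O32_-, __O200_sup), R a predicate (__R_in), M a mode
# (__M_Element), V an attribute (__V_empty), G a structure
# (__G_doubleLoopStr), and K a left functor bracket (__K_||..);
# CSTHandler.remove_prefix presumably strips these tags from the parsed tree.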
import codecs
import glob
import os
import pickle
import re
from collections import defaultdict
from pathlib import Path

from emparser.preprocess import Lexer

DATA_DIR = Path("emparser/data")
ABS_DIR = os.path.join(DATA_DIR, 'mml.vct')

lexer = Lexer()
lexer.load_symbol_dict(ABS_DIR)
lexer.build_len2symbol()

RESERVED_WORDS = set([
    "according", "aggregate", "all", "and", "antonym", "are", "as",
    "associativity", "assume", "asymmetry", "attr", "be", "begin", "being",
    "by", "canceled", "case", "cases", "cluster", "coherence",
    "commutativity", "compatibility", "connectedness", "consider",
    "consistency", "constructors", "contradiction", "correctness", "def",
    "deffunc", "define", "definition", "definitions", "defpred", "do",
    "does", "end", "environ", "equals", "ex", "exactly", "existence",
    "for", "from", "func", "given", "hence", "hereby", "holds",
    "idempotence", "identify", "if", "iff", "implies", "involutiveness",
    "irreflexivity", "is", "it", "let", "means", "mode", "non", "not",
    "notation", "notations", "now", "of", "or", "otherwise", "over",
    "per", "pred", "prefix", "projectivity", "proof", "provided", "qua",
    "reconsider", "reduce", "reducibility", "redefine", "reflexivity",
    "registration", "registrations", "requirements", "reserve", "sch",
    "scheme", "schemes", "section", "selector", "set", "sethood", "st",
    "struct", "such", "suppose", "symmetry", "synonym", "take", "that",
    "the", "then", "theorem", "theorems", "thesis", "thus", "to",
    "transitivity",