import filecmp
import time

import pytest
from lxml import etree as ET  # ET.tostring(..., pretty_print=True) below requires lxml

from emparser.preprocess import Lexer
# NOTE: this file does not show where common, Parser, and CSTHandler live;
# the import paths below are assumptions and may need adjusting to the repo layout.
from emparser import common
from emparser.parser import Parser
from emparser.cst_handler import CSTHandler


class TestParser:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.parser = Parser()
        self.lexer = Lexer()
        self.lexer.load_symbol_dict(common.MML_VCT)
        self.lexer.build_len2symbol()
        yield

    def test_parse_theorem_1(self):
        case = "theorem ( ( for r , s , t holds ( r __O_* s ) __O_* t = r __O_* ( s __O_* t ) ) \n" + \
               "& ex t st for s1 holds s1 __O_* t = s1 & t __O_* s1 = s1 & ex s2 st s1 __O_* s2 \n" + \
               "= t & s2 __O_* s1 = t ) implies S is __M_Group ;"
        xmlstr = self.parser.parse_theorem(case)
        # xml_root = self.parser.parse_theorem(case)
        # xmlstr = util.pretty_xml(xml_root)
        output_path = common.OUTPUT_DIR + '/theorem1.xml'
        # output_path = common.EXPECT_DIR + '/main/theorem1.xml'
        with open(output_path, 'w') as file:
            file.write(xmlstr)
        expect_path = common.EXPECT_DIR + '/main/theorem1.xml'
        assert filecmp.cmp(expect_path, output_path, shallow=False)
class TestMizarErrorListener:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.lexer = Lexer()
        self.lexer.load_symbol_dict(common.MML_VCT)
        self.lexer.build_len2symbol()
        self.parser = Parser()
        yield
class TestPerformance:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.lexer = Lexer()
        self.parser = Parser()
        self.handler = CSTHandler()
        yield

    @pytest.mark.slow
    def test_performance_1(self):
        """
        current performance test result:
        #1 : 3.743171691894531e-05
        #2 : 0.0007483959197998047
        #3 : 0.0007982254028320312
        #4 : 0.006863117218017578
        #5 : 0.010075092315673828
        #6 : 0.16891860961914062
        #7 : 0.41028857231140137
        #8 : 0.0686800479888916
        #9 : 0.015757322311401367
        #10 : 0.001768350601196289
        """
        print('#test_performance_1#')
        t0 = time.time()
        self.lexer.clear()
        self.lexer.build_len2symbol()
        t1 = time.time()
        print(f'#1 : {t1 - t0}')

        mizpath = common.DATA_DIR + '/ring_1.miz'
        with open(mizpath, 'r') as f:
            lines = f.readlines()
        t2 = time.time()
        print(f'#2 : {t2 - t1}')

        env_lines, tp_lines = self.lexer.separate_env_and_text_proper(lines)
        env_lines = self.lexer.remove_comment(env_lines)
        tp_lines = self.lexer.remove_comment(tp_lines)
        t3 = time.time()
        print(f'#3 : {t3 - t2}')

        env_tokens, env_posmap = self.lexer.lex(env_lines, is_environment_part=True)
        # env_xmlroot = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlstr = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlroot = ET.fromstring(env_xmlstr)
        vocfiles = self.handler.extract_vocabularies(env_xmlroot)
        # print(vocfiles)
        t4 = time.time()
        print(f'#4 : {t4 - t3}')

        self.lexer.load_symbol_dict(common.MML_VCT, vocfiles)
        self.lexer.build_len2symbol()
        t5 = time.time()
        print(f'#5 : {t5 - t4}')
        # print(self.lexer.symbol_dict)
        # print(self.lexer.len2symbol)

        tp_tokens, tp_posmap = self.lexer.lex(tp_lines, first_line_number=len(env_lines)+1)
        t6 = time.time()
        print(f'#6 : {t6 - t5}')

        # tp_xmlroot = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlstr = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlroot = ET.fromstring(tp_xmlstr)
        t7 = time.time()
        print(f'#7 : {t7 - t6}')

        self.handler.adjust_type_expression(tp_xmlroot)
        self.handler.adjust_term_expression(tp_xmlroot)
        self.handler.remove_prefix(tp_xmlroot)
        t8 = time.time()
        print(f'#8 : {t8 - t7}')

        # tp_xmlstr = util.pretty_xml(tp_xmlroot)
        tp_xmlstr = ET.tostring(tp_xmlroot, pretty_print=True).decode('utf-8')
        t9 = time.time()
        print(f'#9 : {t9 - t8}')

        output_path = common.OUTPUT_DIR + '/performance_1.xml'
        # output_path = common.EXPECT_DIR + '/performance/performance_1.xml'
        with open(output_path, 'w') as file:
            file.write(tp_xmlstr)
        t10 = time.time()
        print(f'#10 : {t10 - t9}')

        expect_path = common.EXPECT_DIR + '/performance/performance_1.xml'
        assert filecmp.cmp(expect_path, output_path, shallow=False)

    @pytest.mark.slow
    def test_performance_2(self):
        """
        current performance test result:
        #1 : 0.00012969970703125
        #2 : 0.011978864669799805
        #3 : 0.008319616317749023
        #4 : 0.006693124771118164
        #5 : 0.005953550338745117
        #6 : 2.4441311359405518
        #7 : 14.36815881729126
        #8 : 1.1940639019012451
        #9 : 0.3822813034057617
        #10 : 0.08751440048217773
        """
        print('#test_performance_2#')
        t0 = time.time()
        self.lexer.clear()
        self.lexer.build_len2symbol()
        t1 = time.time()
        print(f'#1 : {t1 - t0}')

        mizpath = common.DATA_DIR + '/jgraph_4.miz'
        with open(mizpath, 'r') as f:
            lines = f.readlines()
        t2 = time.time()
        print(f'#2 : {t2 - t1}')

        env_lines, tp_lines = self.lexer.separate_env_and_text_proper(lines)
        env_lines = self.lexer.remove_comment(env_lines)
        tp_lines = self.lexer.remove_comment(tp_lines)
        t3 = time.time()
        print(f'#3 : {t3 - t2}')

        env_tokens, env_posmap = self.lexer.lex(env_lines, is_environment_part=True)
        # env_xmlroot = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlstr = self.parser.parse_environment('\n'.join(env_tokens), env_posmap)
        env_xmlroot = ET.fromstring(env_xmlstr)
        vocfiles = self.handler.extract_vocabularies(env_xmlroot)
        # print(vocfiles)
        t4 = time.time()
        print(f'#4 : {t4 - t3}')

        self.lexer.load_symbol_dict(common.MML_VCT, vocfiles)
        self.lexer.build_len2symbol()
        t5 = time.time()
        print(f'#5 : {t5 - t4}')
        # print(self.lexer.symbol_dict)
        # print(self.lexer.len2symbol)

        tp_tokens, tp_posmap = self.lexer.lex(tp_lines, first_line_number=len(env_lines)+1)
        t6 = time.time()
        print(f'#6 : {t6 - t5}')

        # tp_xmlroot = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlstr = self.parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap)
        tp_xmlroot = ET.fromstring(tp_xmlstr)
        t7 = time.time()
        print(f'#7 : {t7 - t6}')

        self.handler.adjust_type_expression(tp_xmlroot)
        self.handler.adjust_term_expression(tp_xmlroot)
        self.handler.remove_prefix(tp_xmlroot)
        t8 = time.time()
        print(f'#8 : {t8 - t7}')

        # tp_xmlstr = util.pretty_xml(tp_xmlroot)
        tp_xmlstr = ET.tostring(tp_xmlroot, pretty_print=True).decode('utf-8')
        t9 = time.time()
        print(f'#9 : {t9 - t8}')

        output_path = common.OUTPUT_DIR + '/performance_2.xml'
        # output_path = common.EXPECT_DIR + '/performance/performance_2.xml'
        with open(output_path, 'w') as file:
            file.write(tp_xmlstr)
        t10 = time.time()
        print(f'#10 : {t10 - t9}')

        expect_path = common.EXPECT_DIR + '/performance/performance_2.xml'
        assert filecmp.cmp(expect_path, output_path, shallow=False)
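# The two tests above exercise the full lexer -> parser -> CSTHandler pipeline
# step by step. The helper below is a minimal sketch of that same pipeline as
# one function, reusing only calls that already appear in the tests; the
# function name parse_miz_file itself is hypothetical, not part of emparser.
def parse_miz_file(mizpath):
    lexer = Lexer()
    parser = Parser()
    handler = CSTHandler()
    lexer.clear()
    lexer.build_len2symbol()

    with open(mizpath, 'r') as f:
        lines = f.readlines()

    # Split the article into environment and text-proper parts, drop comments.
    env_lines, tp_lines = lexer.separate_env_and_text_proper(lines)
    env_lines = lexer.remove_comment(env_lines)
    tp_lines = lexer.remove_comment(tp_lines)

    # Parse the environment first to find which vocabulary files the article
    # imports, then reload the symbol dictionary restricted to those files.
    env_tokens, env_posmap = lexer.lex(env_lines, is_environment_part=True)
    env_xmlroot = ET.fromstring(
        parser.parse_environment('\n'.join(env_tokens), env_posmap))
    vocfiles = handler.extract_vocabularies(env_xmlroot)
    lexer.load_symbol_dict(common.MML_VCT, vocfiles)
    lexer.build_len2symbol()

    # Lex and parse the text proper, then normalize the resulting CST.
    tp_tokens, tp_posmap = lexer.lex(tp_lines, first_line_number=len(env_lines) + 1)
    tp_xmlroot = ET.fromstring(
        parser.parse_text_proper('\n'.join(tp_tokens), tp_posmap))
    handler.adjust_type_expression(tp_xmlroot)
    handler.adjust_term_expression(tp_xmlroot)
    handler.remove_prefix(tp_xmlroot)
    return ET.tostring(tp_xmlroot, pretty_print=True).decode('utf-8')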
class TestLexer:
    @pytest.fixture(scope='function', autouse=True)
    def prepare_instance(self):
        self.lexer = Lexer()
        self.lexer.load_symbol_dict(common.MML_VCT)
        self.lexer.build_len2symbol()

    def test_separate_env_and_text_proper(self):
        filepath = common.DATA_DIR + '/ring_1.miz'
        with open(filepath, 'r') as f:
            lines = f.readlines()
        env_lines, tp_lines = self.lexer.separate_env_and_text_proper(lines)
        assert len(env_lines) == 40
        assert len(tp_lines) == 1234 - 40

    def test_load_symbol_dict(self):
        # 1. load symbols in the specified Mizar files
        self.lexer.load_symbol_dict(common.MML_VCT, ["AFF_1", "AFF_2", "AFVECT0"])
        # HIDDEN -> 4, AFF_1 -> 1, AFF_2 -> 14, AFVECT0 -> 10
        assert len(self.lexer.symbol_dict) == 4 + 1 + 14 + 10

        # 2. load all symbols in MML
        # pprint.pprint(self.lexer.symbol_dict)
        self.lexer.load_symbol_dict(common.MML_VCT)
        assert len(self.lexer.symbol_dict) == 9214
        assert self.lexer.symbol_dict['zeros'] == {
            'filename': 'PRVECT_1',
            'type': 'O'
        }

    def test_build_len2symbol(self):
        # pprint.pprint(self.lexer.len2symbol)
        assert len(self.lexer.len2symbol) == 45

    def test_read_until_space(self):
        cases = [
            ["abc def ghi", "abc"],
            ["abc__8()\nfaa ghi", "abc__8()"],
            ["abc__8()faaghi", "abc__8()faaghi"],
        ]
        for case in cases:
            res = self.lexer.read_until_space(case[0])
            assert res == case[1]

    def test_read_identifier(self):
        cases = [
            ["abc def ghi", "abc"],
            ["abC_d2Ef3 ghi", "abC_d2Ef3"],
            ["abC_d2(.Ef3 ghi", "abC_d2"],
            ["a'abC_d2_'(.Ef3 ghi", "a'abC_d2_'"],
            [" def ghi", ""],
        ]
        for case in cases:
            res = self.lexer.read_identifier(case[0])
            assert res == case[1]

    def test_is_word_boundary(self):
        cases = [
            [('a', 'b'), False],
            [('_', 'b'), True],
            [('a', '0'), False],
            [("'", 'b'), True],
            [('a', "'"), True],
            [('a', '('), True],
            [(')', 'b'), True],
            [(')', '('), True],
        ]
        for case in cases:
            res = self.lexer.is_word_boundary(*case[0])
            assert res == case[1]

    def test_cut_symbol(self):
        cases = [
            [".abc def ghi", ("__O100_.", "abc def ghi")],
            ["..abc def ghi", ("__O100_..", "abc def ghi")],
            ["||..abc def ghi", ("__K_||..", "abc def ghi")],
            ["abss def ghi", None],
            [",;:abc||def", (",", ";:abc||def")],
            [",||;:abcdef", (",", "||;:abcdef")],
            ["$1,abcdef", ("$1", ",abcdef")],
            ["...||abcdef", ("...", "||abcdef")],
            ["||abcdef", ('__O100_||', 'abcdef')],
            ["= a", ('=', ' a')],
            ["& sup I in I;", ('&', ' sup I in I;')],
        ]
        for case in cases:
            res = self.lexer.cut_symbol(case[0])
            assert res == case[1]

    def test_cut_reserved_word(self):
        cases = [
            ["qua;abc def", ("qua", ";abc def")],
            ["associativity\nsuppose", ("associativity", "\nsuppose")],
            ["abc def", None],
        ]
        for case in cases:
            res = self.lexer.cut_reserved_word(case[0])
            assert res == case[1]

    def test_cut_identifier(self):
        cases = [
            ["ABC;abc def", ("ABC", ";abc def")],
            ["ABC abc def", ("ABC", " abc def")],
            ["123 abc, def", None],
        ]
        for case in cases:
            res = self.lexer.cut_identifier(case[0])
            assert res == case[1]

    def test_cut_numeral(self):
        cases = [
            ["123;abc def", ("123", ";abc def")],
            ["456 abc def", ("456", " abc def")],
            ["1 abc def", ("1", " abc def")],
            ["0 abc def", ("0", " abc def")],
            ["012 abc def", None],
            ["ABC abc def", None],
        ]
        for case in cases:
            res = self.lexer.cut_numeral(case[0])
            assert res == case[1]

    def test_remove_comment_in_a_line(self):
        cases = [
            "theorem :: ABCMIZ_0:1",
            "holds ex_sup_of I, T & sup I in I; :: this is a comment",
            ":: everything is comment",
        ]
        expects = [
            "theorem ",
            "holds ex_sup_of I, T & sup I in I; ",
            "",
        ]
        for case, expect in zip(cases, expects):
            res = self.lexer.remove_comment_in_a_line(case)
            assert res == expect

    def test_remove_comment(self):
        case1 = [
            "theorem :: ABCMIZ_0:1",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        expect1 = [
            "theorem ",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        res1 = self.lexer.remove_comment(case1)
        assert expect1 == res1

        case2 = [
            "theorem :: ABCMIZ_0:1",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        expect2 = [
            "theorem ",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        res2 = self.lexer.remove_comment(case2)
        assert expect2 == res2

    def test_lex(self):
        case1 = [
            "theorem ",
            "for T being Noetherian sup-Semilattice for I being Ideal of T",
            "holds ex_sup_of I, T & sup I in I;"
        ]
        expect1 = [
            "theorem",
            "for T being __V_Noetherian __M_sup-Semilattice for I being __M_Ideal of T",
            "holds __R_ex_sup_of I , T & __O200_sup I __R_in I ;"
        ]
        text1, pos_map1 = self.lexer.lex(case1)
        assert expect1 == text1

        case2 = [
            "theorem",
            "((for r,s,t holds (r * s) * t = r * (s * t)) & ex t st for s1 holds s1",
            "* t = s1 & t * s1 = s1 & ex s2 st s1 * s2 = t & s2 * s1 = t) implies S is Group;"
        ]
        expect2 = [
            "theorem",
            "( ( for r , s , t holds ( r __O_* s ) __O_* t = r __O_* ( s __O_* t ) ) & ex t st for s1 holds s1",
            "__O_* t = s1 & t __O_* s1 = s1 & ex s2 st s1 __O_* s2 = t & s2 __O_* s1 = t ) implies S is __M_Group ;"
        ]
        text2, pos_map2 = self.lexer.lex(case2)
        assert expect2 == text2

        case3 = [
            "theorem",
            "for F be add-associative right_zeroed right_complementable",
            "right-distributive non empty doubleLoopStr, x,y,z being Element of F holds",
            "x*(y-z) = x*y - x*z;",
        ]
        expect3 = [
            'theorem',
            'for F be __V_add-associative __V_right_zeroed __V_right_complementable',
            '__V_right-distributive non __V_empty __G_doubleLoopStr , x , y , z being __M_Element of F holds',
            'x __O_* ( y __O32_- z ) = x __O_* y __O32_- x __O_* z ;',
        ]
        text3, pos_map3 = self.lexer.lex(case3)
        assert expect3 == text3

        case4 = [
            "theorem",
            "for V being add-associative right_zeroed right_complementable non",
            "empty addLoopStr, u,v,w being Element of V holds -(v+w)=-w-v & -(w+-v)=v-w & -",
            "(v-w)=w+-v & -(-v-w)=w+v & u-(w+v)=u-v-w;",
        ]
        expect4 = [
            'theorem',
            'for V being __V_add-associative __V_right_zeroed __V_right_complementable non',
            '__V_empty __G_addLoopStr , u , v , w being __M_Element of V holds __O32_- ( v __O32_+ w ) = __O32_- w __O32_- v & __O32_- ( w __O32_+ __O32_- v ) = v __O32_- w & __O32_-',
            '( v __O32_- w ) = w __O32_+ __O32_- v & __O32_- ( __O32_- v __O32_- w ) = w __O32_+ v & u __O32_- ( w __O32_+ v ) = u __O32_- v __O32_- w ;',
        ]
        text4, pos_map4 = self.lexer.lex(case4)
        assert expect4 == text4

        case5 = [
            "definition",
            " let K be non empty multMagma, S be Subset of K;",
            " attr S is quasi-prime means",
            "for a, b being Element of K st a*b in S holds a in S or b in S;",
            "end;"
        ]
        expect5 = [
            "definition",
            "let K be non __V_empty __G_multMagma , S be __M_Subset of K ;",
            "attr S is __V_quasi-prime means",
            "for a , b being __M_Element of K st a __O_* b __R_in S holds a __R_in S or b __R_in S ;",
            "end ;"
        ]
        text5, pos_map5 = self.lexer.lex(case5)
        assert expect5 == text5

        case6 = [
            "registration",
            "let K be non empty multLoopStr;",
            "cluster prime -> proper quasi-prime for Subset of K;",
            "cluster proper quasi-prime -> prime for Subset of K;",
            "end;",
        ]
        expect6 = [
            "registration",
            "let K be non __V_empty __G_multLoopStr ;",
            "cluster __V_prime -> __V_proper __V_quasi-prime for __M_Subset of K ;",
            "cluster __V_proper __V_quasi-prime -> __V_prime for __M_Subset of K ;",
            "end ;",
        ]
        text6, pos_map6 = self.lexer.lex(case6)
        assert expect6 == text6

        case7 = [
            "notation",
            "let R be Ring, I be Ideal of R;",
            "synonym R/I for QuotientRing(R,I);",
            "end;",
        ]
        expect7 = [
            "notation",
            "let R be __M_Ring , I be __M_Ideal of R ;",
            "synonym R __O_/ I for __O_QuotientRing ( R , I ) ;",
            "end ;",
        ]
        text7, pos_map7 = self.lexer.lex(case7)
        assert expect7 == text7

        case8 = [
            "scheme",
            "NatInd { P[Nat] } : for k being Nat holds P[k]",
            "provided",
            "P[0] and",
            "for k be Nat st P[k] holds P[k + 1];",
        ]
        expect8 = [
            "scheme",
            "NatInd { P [ __M_Nat ] } : for k being __M_Nat holds P [ k ]",
            "provided",
            "P [ __O_0 ] and",
            "for k be __M_Nat st P [ k ] holds P [ k __O32_+ 1 ] ;",
        ]
        text8, pos_map8 = self.lexer.lex(case8)
        assert expect8 == text8
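# A note on the expected outputs above: the lexer tags each vocabulary symbol
# with a "__<class>[<priority>]_" prefix before handing the token stream to
# the parser. Judging from the cases in TestLexer and the Mizar vocabulary
# (.vct) symbol classes, O marks a functor/operator, optionally with its
# priority (e.g. __O32_-, __O200_sup), R a predicate (__R_in), M a mode
# (__M_Element), V an attribute (__V_empty), G a structure
# (__G_doubleLoopStr), and K a left functor bracket (__K_||..);
# CSTHandler.remove_prefix presumably strips these tags from the parsed tree.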
import codecs
import glob
import os
import pickle
import re
from collections import defaultdict
from pathlib import Path

from emparser.preprocess import Lexer

DATA_DIR = Path("emparser/data")
ABS_DIR = os.path.join(DATA_DIR, 'mml.vct')

lexer = Lexer()
lexer.load_symbol_dict(ABS_DIR)
lexer.build_len2symbol()

RESERVED_WORDS = set([
    "according", "aggregate", "all", "and", "antonym", "are", "as",
    "associativity", "assume", "asymmetry", "attr", "be", "begin", "being",
    "by", "canceled", "case", "cases", "cluster", "coherence",
    "commutativity", "compatibility", "connectedness", "consider",
    "consistency", "constructors", "contradiction", "correctness", "def",
    "deffunc", "define", "definition", "definitions", "defpred", "do",
    "does", "end", "environ", "equals", "ex", "exactly", "existence",
    "for", "from", "func", "given", "hence", "hereby", "holds",
    "idempotence", "identify", "if", "iff", "implies", "involutiveness",
    "irreflexivity", "is", "it", "let", "means", "mode", "non", "not",
    "notation", "notations", "now", "of", "or", "otherwise", "over",
    "per", "pred", "prefix", "projectivity", "proof", "provided", "qua",
    "reconsider", "reduce", "reducibility", "redefine", "reflexivity",
    "registration", "registrations", "requirements", "reserve", "sch",
    "scheme", "schemes", "section", "selector", "set", "sethood", "st",
    "struct", "such", "suppose", "symmetry", "synonym", "take", "that",
    "the", "then", "theorem", "theorems", "thesis", "thus", "to",
    "transitivity",