def test_glr_recovery_custom_new_position():
    """
    Test that a custom recovery that increments the position works.
    """
    def custom_recovery(head, error):
        # This recovery will just skip over the erroneous part
        # of the input: '& 89'.
        head.position += 4
        return head.parser.default_error_recovery(head)

    parser = GLRParser(g, actions=actions, error_recovery=custom_recovery)

    results = parser.parse('1 + 5 & 89 - 2')
    assert len(parser.errors) == 1
    assert len(results) == 2
    result_set = set([parser.call_actions(tree) for tree in results])
    assert len(result_set) == 1

    # The calculated result should be that of '1 + 5 - 2'
    assert result_set.pop() == 4
def test_glr_recovery_custom_new_position():
    """
    Test that a custom recovery that increments the position works.
    """
    def custom_recovery(context, error):
        # This recovery will just skip over the erroneous part
        # of the input: '& 89'.
        return None, context.position + 4

    parser = GLRParser(g, actions=actions, error_recovery=custom_recovery,
                       debug=True)

    results = parser.parse('1 + 5 & 89 - 2')
    assert len(parser.errors) == 1
    assert len(results) == 2
    assert len(set(results)) == 1

    # The calculated result should be that of '1 + 5 - 2'
    assert results[0] == 4
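# A minimal sketch, not part of the original tests: besides a custom callable,
# parglare also accepts error_recovery=True to enable its built-in recovery,
# which drops input until parsing can resume. It reuses the module-level `g`
# and `actions` fixtures from the tests above; the exact error count depends
# on the recovery strategy, so only non-emptiness is asserted here.
def example_default_error_recovery():
    parser = GLRParser(g, actions=actions, error_recovery=True)
    parser.parse('1 + 5 & 89 - 2')
    # At least one error was recovered from and reported.
    assert parser.errors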
def test_unbounded_ambiguity():
    """
    This grammar has unbounded ambiguity. Grammar G6 from:
    Nozohoor-Farshi, Rahman: "GLR Parsing for ε-Grammars"
    """
    grammar = """
    S: M N;
    M: A M "b" | "x";
    N: "b" N A | "x";
    A: EMPTY;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    results = p.parse("xbbbbx")
    assert len(results) == 5
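# A sketch of the counting argument, not in the original suite: M derives
# x b^k and N derives b^m x, so for input "x b^n x" the n "b"s can be split
# between M and N in n + 1 ways. For n = 4 this gives the 5 solutions
# asserted above.
def example_unbounded_ambiguity_counts():
    g = Grammar.from_string('S: M N; M: A M "b" | "x"; N: "b" N A | "x"; '
                            'A: EMPTY;')
    p = GLRParser(g)
    for n in range(5):
        assert len(p.parse("x" + "b" * n + "x")) == n + 1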
def test_right_nullable():
    """
    Grammar Γ2 (p. 17) from:
    Scott, E. and Johnstone, A., 2006. Right nulled GLR parsers. ACM
    Transactions on Programming Languages and Systems (TOPLAS), 28(4),
    pp. 577-618.
    """
    grammar = """
    S: "a" S A | EMPTY;
    A: EMPTY;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    results = p.parse("aa")
    assert len(results) == 1
def test_reduce_enough_many_empty():
    """
    This is an extension of the previous grammar where the parser must
    reduce enough A B pairs to succeed. The language is the same:
    xb^n, n>=0
    """
    grammar = """
    S: A B S "b";
    S: "x";
    A: EMPTY;
    B: EMPTY;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    results = p.parse("xbbb")
    assert len(results) == 1
def test_cyclic_grammar_1():
    """
    Grammar G1 from the paper:
    "GLR Parsing for ε-Grammars" by Rahman Nozohoor-Farshi
    """
    grammar = """
    S: A;
    A: S;
    A: 'x';
    """
    g = Grammar.from_string(grammar)

    with pytest.raises(SRConflicts):
        Parser(g, prefer_shifts=False)

    p = GLRParser(g)
    results = p.parse('x')

    # This grammar builds an infinite/looping tree:
    # x -> A -> S -> A -> S...
    with pytest.raises(LoopError):
        len(results)
def test_lr2_grammar():
    grammar = r"""
    Model: Prods EOF;
    Prods: Prod | Prods Prod;
    Prod: ID "=" ProdRefs;
    ProdRefs: ID | ProdRefs ID;

    terminals
    ID: /\w+/;
    """
    input_str = """
    First = One Two three
    Second = Foo Bar
    Third = Baz
    """
    g = Grammar.from_string(grammar)

    # This grammar is not LR(1) as it requires at least two tokens of
    # lookahead to decide what to do on each ID from the right side.
    # If '=' follows an ID then it should reduce "Prod",
    # else it should reduce ID as ProdRefs.
    with pytest.raises(SRConflicts):
        Parser(g, prefer_shifts=False)

    # The prefer_shifts strategy (the default) will remove the conflicts,
    # but the resulting parser will fail to parse any input as it will
    # greedily consume the next rule's ID as a body element of the
    # previous Prod rule.
    parser = Parser(g)
    with pytest.raises(ParseError):
        parser.parse(input_str)

    # But the input can be parsed unambiguously by GLR.
    p = GLRParser(g)
    results = p.parse(input_str)
    assert len(results) == 1
def todo_test_cyclic_grammar_2():
    """
    From the paper:
    "GLR Parsing for ε-Grammars" by Rahman Nozohoor-Farshi
    """
    grammar = """
    S: S S;
    S: 'x';
    S: EMPTY;
    """
    g = Grammar.from_string(grammar)

    with pytest.raises(SRConflicts):
        Parser(g, prefer_shifts=False)

    p = GLRParser(g, debug=True)
    results = p.parse('xx')

    # This grammar has infinite ambiguity, but by minimizing empty
    # reductions we shall get only one result: xx -> xS -> SS -> S
    assert len(results) == 1
def test_glr_recovery_custom_new_position():
    """
    Test that a custom recovery that increments the position works.
    """
    error = Error(0, 1, message="Error")

    def custom_recovery(parser, input_str, position, symbols):
        # This recovery will just skip over the erroneous part
        # of the input: '& 89'.
        return error, position + 4, None

    parser = GLRParser(g, actions=actions, error_recovery=custom_recovery,
                       debug=True)

    results = parser.parse('1 + 5 & 89 - 2')
    assert len(parser.errors) == 1
    assert parser.errors[0] is error
    assert len(results) == 2
    assert len(set(results)) == 1

    # The calculated result should be that of '1 + 5 - 2'
    assert results[0] == 4
def test_glr_recovery_custom_new_token():
    """
    Test that a custom recovery that introduces a new token works.
    """
    error = Error(0, 1, message="Error")

    def custom_recovery(parser, input_str, position, symbols):
        # Here we will introduce the missing operation token.
        return error, None, Token(g.get_terminal('-'), '-', 0)

    parser = GLRParser(g, actions=actions, error_recovery=custom_recovery,
                       debug=True)

    results = parser.parse('1 + 5 8 - 2')
    assert len(parser.errors) == 1
    assert parser.errors[0] is error
    assert len(results) == 5
    assert len(set(results)) == 2
    assert -4 in results
    assert 0 in results
def test_bounded_direct_ambiguity():
    """
    This grammar has bounded direct ambiguity of degree 2, in spite of
    being unboundedly ambiguous, as for every k we can find a string that
    will give at least k solutions.
    The language is t^{m}xb^{n}, n>=m>=0
    Grammar G5 from:
    Nozohoor-Farshi, Rahman: "GLR Parsing for ε-Grammars"
    """
    grammar = """
    S: A S "b" | "x";
    A: "t" | EMPTY;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    results = p.parse("txbbbbb")
    assert len(results) == 5
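# A sketch, not in the original suite: each of the n nested S layers carries
# an A that may or may not produce "t", so a single "t" can be attributed to
# any of the n layers, giving n derivations (5 for the input above, n = 5).
def example_direct_ambiguity_counts():
    g = Grammar.from_string('S: A S "b" | "x"; A: "t" | EMPTY;')
    p = GLRParser(g)
    for n in range(1, 5):
        assert len(p.parse("tx" + "b" * n)) == n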
def test_glr_recovery_custom_new_token():
    """
    Test that a custom recovery that introduces a new token works.
    """
    def custom_recovery(head, error):
        # Here we will introduce the missing operation token.
        head.token_ahead = Token(g.get_terminal('-'), '-',
                                 head.position, length=0)
        return True

    parser = GLRParser(g, actions=actions, error_recovery=custom_recovery)

    results = parser.parse('1 + 5 8 - 2')
    assert len(parser.errors) == 1
    assert len(results) == 5
    result_set = set([parser.call_actions(tree) for tree in results])
    assert len(result_set) == 2
    assert -4 in result_set
    assert 0 in result_set
def test_cyclic_grammar_2():
    """
    Grammar G2 from the paper:
    "GLR Parsing for ε-Grammars" by Rahman Nozohoor-Farshi

    The classic Tomita GLR algorithm doesn't terminate with this grammar.
    parglare will succeed in parsing but will report a LoopError during
    any tree traversal, as the built SPPF is circular.
    """
    grammar = """
    S: S S;
    S: 'x';
    S: EMPTY;
    """
    g = Grammar.from_string(grammar)

    with pytest.raises(SRConflicts):
        Parser(g, prefer_shifts=False)

    p = GLRParser(g)
    results = p.parse('xx')

    with pytest.raises(LoopError):
        len(results)
def test_bounded_ambiguity():
    """
    This grammar has bounded ambiguity. The language is the same:
    xb^n, n>=0, but each valid sentence will always have two derivations.
    Grammar G4 from:
    Nozohoor-Farshi, Rahman: "GLR Parsing for ε-Grammars"
    """
    grammar = """
    S: M | N;
    M: A M "b" | "x";
    N: A N "b" | "x";
    A: EMPTY;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g)
    results = p.parse("xbbb")
    assert len(results) == 2
def test_cyclic_grammar_3():
    """
    Grammar with an indirect cycle:
    r:EMPTY->A; r:A->S; r:EMPTY->A; r:SA->S; r:EMPTY->A; r:SA->S;...
    """
    grammar = """
    S: S A | A;
    A: "a" | EMPTY;
    """
    g = Grammar.from_string(grammar)

    Parser(g)

    p = GLRParser(g)
    results = p.parse('aa')

    assert len(results) == 2
    expected = [
        ['a', 'a'],
        [[[], 'a'], 'a']
    ]
    assert results == expected
def test_issue_112_wrong_error_report():
    """
    Test that token ahead is not among expected symbols in error message.
    """
    grammar = r'''
    _input: sentence | standalonePhrase;
    standalonePhrase: interjection* __phrase;
    sentence: interjection* sentence1
        | sentenceJoiningAdverb? sentence1;
    sentence1: subordinateClause* _clause sentenceEnd
        | subordinateClause* quotationShortForm;
    subordinateClause: _clause clauseConnector punctuation*;
    _clause: __phrase* verbPhrase | __phrase* complement? copulaPhrase;

    // ---- phrases ----
    __phrase: topic | subject | object | adjectivalPhrase
        | adverbialPhrase | nounPhrase;
    topic: nounPhrase topicMarker;
    subject: nounPhrase subjectMarker;
    object: nounPhrase objectMarker;
    complement: nounPhrase complementMarker?;
    adjectivalPhrase: adjective+ nounPhrase;

    // ---- noun-related ----
    nounPhrase: singleNounPhrase | combinedNounPhrase;
    combinedNounPhrase: singleNounPhrase continuedNounPhrase+;
    continuedNounPhrase: conjunction singleNounPhrase;
    singleNounPhrase: determiner singleNounPhrase1 auxiliaryParticle* punctuation*
        | singleNounPhrase1 auxiliaryParticle* punctuation*;
    singleNounPhrase1: basicNounPhrase | modifiedNounPhrase | countingPhrase;
    basicNounPhrase: noun+ | possessive;
    possessive: noun+ possessiveMarker;
    modifiedNounPhrase: basicNounPhrase nounModifyingSuffix;
    countingPhrase: basicNounPhrase number
        | basicNounPhrase number counter
        | number basicNounPhrase
        | counter possessiveMarker basicNounPhrase;
    noun: simpleNoun | nominalForm | nominalizedVerb | verbModifiedToNoun;
    nominalizedVerb: _clause nominalizingSuffix;
    verbModifiedToNoun: _clause verbToNounModifyingForm;
    adjective: _clause adnominalSuffix | possessive;

    // ---- verb-related ----
    copulaPhrase: adverb* copula verbSuffix* predicateEndingSuffix?;
    verbPhrase: adverb* verbPhrase1 nominalVerbForm? verbSuffix* predicateEndingSuffix?;
    verbPhrase1: basicVerbPhrase | negative basicVerbPhrase | basicVerbPhrase negative;
    basicVerbPhrase: verbCombination | honorificVerb | verbAndAuxiliary
        | modifiedVerb | indirectQuotation | nominalAsVerb;
    verbCombination: verb | verb verbCombiner verbCombination;
    verb: simpleVerb | descriptiveVerb;
    honorificVerb: verb honorificMarker;
    verbAndAuxiliary: verb nominalVerbForm? verbSuffix* auxiliaryVerb+;
    modifiedVerb: verb honorificMarker? verbModifier
        | verbAndAuxiliary honorificMarker? verbModifier;
    nominalAsVerb: verb verbNominal | verbAndAuxiliary verbNominal;
    auxiliaryVerb: simpleAuxiliaryVerb honorificMarker?
        | auxiliaryVerbForm honorificMarker?;
    simpleAuxiliaryVerb: auxiliaryVerbConnector verb;
    adverbialPhrase: nounPhrase adverbialParticle auxiliaryParticle*
        | verb adverbialParticle auxiliaryParticle*;

    // ---- quotation forms ----
    indirectQuotation: verb quotationSuffix;
    quotationShortForm: basicVerbPhrase shortQuotationSuffix verbSuffix* predicateEndingSuffix?;

    // ------ others -----
    interjection: interjectionTerminal punctuation*;

    // --- terminal symbols ------------------------------
    terminals
    sentenceEnd: /[^:]+:(SF);/;
    interjectionTerminal: /[^:]+:(IC);/;
    punctuation: /[^:]+:(SP|SS|SE|SO|SW|SWK);/;
    clauseConnector: /[^:]+:(EC|CCF|CCMOD|CCNOM);/;
    topicMarker: /[^:]+:(TOP);/;
    objectMarker: /[^:]+:(JKO);/;
    subjectMarker: /[^:]+:(JKS);/;
    complementMarker: /[^:]+:(JKC);/;
    conjunction: /[^:]+:(JC|CON);/;
    determiner: /[^:]+:(MM);/;
    auxiliaryParticle: /[^:]+:(JX);/;
    possessiveMarker: /[^:]+:(JKG);/;
    nounModifyingSuffix: /[^:]+:(XSN|JKV);/;
    nominalizingSuffix: /[^:]+:(ETN);/;
    adnominalSuffix: /[^:]+:(ETM);/;
    verbSuffix: /[^:]+:(EP|TNS);/;
    predicateEndingSuffix: /[^:]+:(SEF|EF);/;
    negative: /[^:]+:(NEG);/;
    verbCombiner: /고:(EC|CCF);/;
    honorificMarker: /(으시|시):EP;/;
    verbModifier: /[^:]+:(VMOD);/;
    verbNominal: /[^:]+:(VNOM);/;
    adverbialParticle: /[^:]+:(JKB);/;
    quotationSuffix: /[^:]+:(QOT);/;
    shortQuotationSuffix: /[^:]+:(SQOT);/;
    sentenceJoiningAdverb: /[^:]+:MAJ;/;
    simpleNoun: /[^:]+:(NNG|NNP|NNB|NR|SL|NP|SN);/;
    adverb: /[^:]+:(MAG);/;
    simpleVerb: /[^:]+:(VV|VVD|VHV);/;
    descriptiveVerb: /[^:]+:(VA|VCP|VCN|VAD|VHA);/;
    auxiliaryVerbConnector: /[^:]+:(EC);/;
    auxiliaryVerbForm: /[^:]+:(EC);/;
    copula: /(되:VV)|([^:]+:(VCP|VCN));/;
    number: /[^:]+:(SN|NR);/;
    counter: /[^:]+:(NNB|NNG);/;
    nominalForm: /[^:]+:(NNOM);/;
    verbToNounModifyingForm: /[^:]+:(NMOD);/;
    nominalVerbForm: /[^:]+:(VNOM);/;
    '''  # noqa

    g = Grammar.from_string(grammar)
    parser = GLRParser(g)

    with pytest.raises(ParseError) as e:
        parser.parse('공부하:VHV; 는:ETM; 것:NNB; 은:TOP; 아니:VCN; ㅂ니다:SEF; .:SF;')

    assert 'Expected: adnominalSuffix or nominalizingSuffix or '\
        'verbToNounModifyingForm but found <sentenceEnd(.:SF;)>'\
        in str(e.value)
def test_glr_forest_disambiguation():
    parser = GLRParser(Grammar.from_string(grammar))
    forest = parser.parse(r'''
    part1
     part2
      part3
     part2
    part1
     part3
      part2
     part3
    part1
     part2
      part3
    part1
     part2
    ''')

    # We have 415 solutions.
    assert len(forest) == 415
    assert forest.ambiguities == 46

    forest.disambiguate(disambiguate)

    # After the disambiguation, only one solution remains.
    assert len(forest) == 1
    assert forest.to_str().strip() == r'''
document[5->147]
  parts[5->147]
    part1_1[5->147]
      part1_1[5->126]
        part1_1[5->93]
          part1_1[5->49]
            part1[5->49]
              title1[5->10, "part1"]
              parts_opt[16->49]
                parts[16->49]
                  part2_1[16->49]
                    part2_1[16->39]
                      part2[16->39]
                        title2[16->21, "part2"]
                        parts_opt[28->39]
                          parts[28->39]
                            part3_1[28->39]
                              part3[28->39]
                                title3[28->33, "part3"]
                                parts_opt[39->39]
                    part2[39->49]
                      title2[39->44, "part2"]
                      parts_opt[49->49]
          part1[49->93]
            title1[49->54, "part1"]
            parts_opt[60->93]
              parts[60->93]
                part3_1[60->93]
                  part3_1[60->83]
                    part3[60->83]
                      title3[60->65, "part3"]
                      parts_opt[72->83]
                        parts[72->83]
                          part2_1[72->83]
                            part2[72->83]
                              title2[72->77, "part2"]
                              parts_opt[83->83]
                  part3[83->93]
                    title3[83->88, "part3"]
                    parts_opt[93->93]
        part1[93->126]
          title1[93->98, "part1"]
          parts_opt[104->126]
            parts[104->126]
              part2_1[104->126]
                part2[104->126]
                  title2[104->109, "part2"]
                  parts_opt[116->126]
                    parts[116->126]
                      part3_1[116->126]
                        part3[116->126]
                          title3[116->121, "part3"]
                          parts_opt[126->126]
      part1[126->147]
        title1[126->131, "part1"]
        parts_opt[137->147]
          parts[137->147]
            part2_1[137->147]
              part2[137->147]
                title2[137->142, "part2"]
                parts_opt[147->147]
'''.strip()
from parglare import GLRParser
from parglare.tables.persist import table_from_serializable

from _table import table
from grammar import grammar

table = table_from_serializable(table, grammar)
parser = GLRParser(grammar, table=table)

print(parser.parse('aaabbb'))
from parglare import Grammar, GLRParser
# NodeNonTerm lives in parglare.parser in the parglare versions this
# example targets; the import path may differ in newer releases.
from parglare.parser import NodeNonTerm

INPUT = '1 + 2 * 3 + 4'

grammar = r'''
E: E '+' E | E '*' E | '(' E ')' | number;

terminals
number: /\d+/;
'''

g = Grammar.from_string(grammar)
parser = GLRParser(g, build_tree=True)

result = parser.parse(INPUT)


def tree_str(node, depth=0):
    indent = ' ' * depth
    if isinstance(node, NodeNonTerm):
        s = '\n{}[.{} {}\n{}]'.format(
            indent, node.production.symbol,
            ''.join([tree_str(n, depth + 1) for n in node.children]),
            indent)
    else:
        s = '\n{}[.{} ]'.format(indent, node.value)
    return s


with open('qtree_out.txt', 'w') as f:
    # Note: the backslash must be escaped, otherwise '\b' is a
    # backspace character.
    f.write('\\begin{{tabular}}{{{}}}\n'.format('c' * len(result)))
class CParser:

    def __init__(self):
        self._glr = None
        self._setup_parser()
        self.user_defined_types = set()

    def _setup_parser(self):
        """Set up the parser."""
        file_path = os.path.realpath(os.path.dirname(__file__))
        root_path = os.path.split(os.path.abspath(os.path.join(file_path)))[0]
        grammar_path = os.path.join(root_path, "cparser", "cgrammar.pg")
        grammar = Grammar.from_file(grammar_path)

        def typedef_filter(context, action, subresults):
            """Filter for dynamic disambiguation.

            Solves problems with typedef_name disambiguation. Whenever
            REDUCE is called on the typedef_name rule, we first check if
            the ID that is about to be reduced is actually a user-defined
            type (struct, union, typedef). If yes, then the REDUCE will
            be called.
            """
            if action is None:
                return

            production = context.production

            if action is REDUCE and production.symbol.fqn == "typedef_name":
                var_name = subresults[0].value
                if var_name not in self.user_defined_types:
                    return False

            if action is REDUCE and production.symbol.fqn == "primary_exp":
                child = subresults[0]
                if child.symbol.fqn == "id":
                    if child.value in self.user_defined_types:
                        return False

            if action is REDUCE and production.symbol.fqn == "iteration_stat":
                if isrule(subresults[2], "decl_body"):
                    init_declarator_list_opt = subresults[2].children[1]
                    if len(init_declarator_list_opt.children) == 0:
                        return False

            return True

        self._glr = GLRParser(grammar,
                              build_tree=True,
                              call_actions_during_tree_build=True,
                              dynamic_filter=typedef_filter,
                              actions=self._setup_actions(),
                              ws='\n\r\t ')

    def _setup_actions(self):
        """Creates a dict of semantic actions that will be called during
        parsing.

        Returns:
            dict
        """
        def decl_body(_, nodes):
            """Semantic action called for every decl_body production.

            This semantic action is used to collect every user-defined
            type in the code. This includes structs, unions and typedefs.
            """
            def collect_direct_decl_name(init_dcl):
                """Adds the name of the direct declarator into the set of
                user-defined types."""
                declarator = init_dcl.children[0]
                if isrule(declarator.children[0], "direct_declarator"):
                    direct_declarator = declarator.children[0]
                else:
                    # In case of a pointer, the declarator is the
                    # second child.
                    direct_declarator = declarator.children[1]
                if isinstance(direct_declarator.children[0], NodeTerm):
                    value = direct_declarator.children[0].value
                    self.user_defined_types.add(value)

            def recurse_init_decl(init_dcl):
                """Recurses through the init declarator rule."""
                if len(init_dcl.children) > 1:
                    # The last child is always a direct declarator.
                    collect_direct_decl_name(init_dcl.children[-1])
                    # The first child is always the recursive
                    # init_declarator_1_comma.
                    recurse_init_decl(init_dcl.children[0])
                else:
                    collect_direct_decl_name(init_dcl.children[0])

            decl_specs = nodes[0]
            first_el = decl_specs.children[0]
            if isrule(first_el, "storage_class_spec"):
                if first_el.children[0].value == "typedef":
                    # If the current decl_specs is a definition of a
                    # custom type by using 'typedef', get the name of
                    # the defined type.
                    init_decl_list_opt = nodes[1]
                    if not init_decl_list_opt.children:
                        return
                    init_decl_list = init_decl_list_opt.children[0]
                    for init_decl in init_decl_list.children:
                        recurse_init_decl(init_decl)

            # Productions that start with type_spec
            if isrule(first_el, "type_spec"):
                type_spec_children = first_el.children
                ts_first = type_spec_children[0]
                if isrule(ts_first, "struct_or_union_spec"):
                    struct_name = ts_first.children[1].value
                    self.user_defined_types.add(struct_name)

        return {"decl_body": decl_body}

    def parse(self, code, debug=False):
        """Parses the given code string."""
        self.user_defined_types = set()
        self._glr.debug = debug
        results = self._glr.parse(code)
        return results[0]

    def parse_file(self, file_path, use_cpp=False, cpp_path="cpp",
                   cpp_args=None, debug=False):
        """Parses content from the given file."""
        # self.user_defined_types = set()
        # self._glr.debug = debug
        if use_cpp:
            content = preprocess_file(file_path, cpp_path, cpp_args)
        else:
            with open(file_path) as f:
                content = f.read()
        return self.parse(content, debug)
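# A hypothetical usage sketch, not part of the original module; it assumes
# cgrammar.pg is available where _setup_parser expects it. It shows how
# decl_body collects typedef'd names while the tree is being built.
if __name__ == '__main__':
    cparser = CParser()
    cparser.parse('typedef unsigned int uint; uint x = 3;')
    print(cparser.user_defined_types)  # expected: {'uint'}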
def test_expressions():
    actions = {
        "E": [
            lambda _, nodes: nodes[0] + nodes[2],
            lambda _, nodes: nodes[0] * nodes[2],
            lambda _, nodes: nodes[1],
            lambda _, nodes: int(nodes[0])
        ]
    }

    # This grammar is highly ambiguous if priorities and associativities
    # are not defined to disambiguate.
    grammar = r"""
    E: E "+" E | E "*" E | "(" E ")" | /\d+/;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g, actions=actions, debug=True)

    # Even this simple expression has 2 different interpretations:
    # (4 + 2) * 3 and
    # 4 + (2 * 3)
    results = p.parse("4 + 2 * 3")
    assert len(results) == 2
    assert 18 in results and 10 in results

    # Adding one more operand raises the number of interpretations to 5.
    results = p.parse("4 + 2 * 3 + 8")
    assert len(results) == 5

    # One more and there are 14 interpretations.
    results = p.parse("4 + 2 * 3 + 8 * 5")
    assert len(results) == 14

    # The number of interpretations is the Catalan number of n,
    # where n is the number of operations:
    # https://en.wikipedia.org/wiki/Catalan_number
    # This number rises very fast. For 10 operations the number of
    # interpretations is 16796!

    # If we raise the priority of the multiplication operation we reduce
    # ambiguity. The default production priority is 10. Here we will raise
    # it to 15 for multiplication.
    grammar = r"""
    E: E "+" E | E "*" E {15} | "(" E ")" | /\d+/;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g, actions=actions)

    # This expression now has 2 interpretations:
    # (4 + (2*3)) + 8
    # 4 + ((2*3) + 8)
    # This is due to the associativity of the + operation, which is
    # not defined.
    results = p.parse("4 + 2 * 3 + 8")
    assert len(results) == 2

    # If we define associativity for both + and * we have resolved all
    # ambiguities in the grammar.
    grammar = r"""
    E: E "+" E {left} | E "*" E {left, 15} | "(" E ")" | /\d+/;
    """
    g = Grammar.from_string(grammar)
    p = GLRParser(g, actions=actions)

    results = p.parse("4 + 2 * 3 + 8 * 5 * 3")
    assert len(results) == 1
    assert results[0] == 4 + 2 * 3 + 8 * 5 * 3
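# A quick sanity check of the Catalan-number claim above: a standalone
# sketch, not a parglare test. C_n = binom(2n, n) / (n + 1) reproduces the
# interpretation counts 2, 5 and 14 for 2, 3 and 4 operations, and 16796
# for 10 operations.
def example_catalan_counts():
    from math import comb

    def catalan(n):
        return comb(2 * n, n) // (n + 1)

    assert [catalan(n) for n in (2, 3, 4)] == [2, 5, 14]
    assert catalan(10) == 16796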
def test_issue_114_empty_and_lexical_ambiguity():
    g = Grammar.from_string(grammar)
    parser = GLRParser(g, build_tree=True)

    results = parser.parse("a car is a kind of vehicle.")
    assert len(results) == 2
    expected = r'''
Sentence[0->27]
  KindDefinitionSentence[0->27]
    a[0->1, "a"]
    IdentifierWord_0[2->5]
      IdentifierWord_1[2->5]
        IdentifierWord[2->5, "car"]
    is[6->8, "is"]
    a[9->10, "a"]
    kind[11->15, "kind"]
    of[16->18, "of"]
    IdentifierWord_0[19->26]
      IdentifierWord_1[19->26]
        IdentifierWord[19->26, "vehicle"]
    KindWith_opt[26->26]
    DOT[26->27, "."]

Sentence[0->27]
  OtherSentence[0->27]
    IdentifierWord_0[0->26]
      IdentifierWord_1[0->26]
        IdentifierWord_1[0->18]
          IdentifierWord_1[0->15]
            IdentifierWord_1[0->10]
              IdentifierWord_1[0->8]
                IdentifierWord_1[0->5]
                  IdentifierWord_1[0->1]
                    IdentifierWord[0->1, "a"]
                  IdentifierWord[2->5, "car"]
                IdentifierWord[6->8, "is"]
              IdentifierWord[9->10, "a"]
            IdentifierWord[11->15, "kind"]
          IdentifierWord[16->18, "of"]
        IdentifierWord[19->26, "vehicle"]
    DOT[26->27, "."]
'''
    assert '\n\n'.join([r.tree_str()
                        for r in results]).strip() == expected.strip()

    results = parser.parse("a car is a kind of vehicle with wheels.")
    assert len(results) == 3
    expected = r'''
Sentence[0->39]
  KindDefinitionSentence[0->39]
    a[0->1, "a"]
    IdentifierWord_0[2->5]
      IdentifierWord_1[2->5]
        IdentifierWord[2->5, "car"]
    is[6->8, "is"]
    a[9->10, "a"]
    kind[11->15, "kind"]
    of[16->18, "of"]
    IdentifierWord_0[19->38]
      IdentifierWord_1[19->38]
        IdentifierWord_1[19->31]
          IdentifierWord_1[19->26]
            IdentifierWord[19->26, "vehicle"]
          IdentifierWord[27->31, "with"]
        IdentifierWord[32->38, "wheels"]
    KindWith_opt[38->38]
    DOT[38->39, "."]

Sentence[0->39]
  KindDefinitionSentence[0->39]
    a[0->1, "a"]
    IdentifierWord_0[2->5]
      IdentifierWord_1[2->5]
        IdentifierWord[2->5, "car"]
    is[6->8, "is"]
    a[9->10, "a"]
    kind[11->15, "kind"]
    of[16->18, "of"]
    IdentifierWord_0[19->26]
      IdentifierWord_1[19->26]
        IdentifierWord[19->26, "vehicle"]
    KindWith_opt[27->38]
      KindWith[27->38]
        with[27->31, "with"]
        IdentifierWord_0[32->38]
          IdentifierWord_1[32->38]
            IdentifierWord[32->38, "wheels"]
    DOT[38->39, "."]

Sentence[0->39]
  OtherSentence[0->39]
    IdentifierWord_0[0->38]
      IdentifierWord_1[0->38]
        IdentifierWord_1[0->31]
          IdentifierWord_1[0->26]
            IdentifierWord_1[0->18]
              IdentifierWord_1[0->15]
                IdentifierWord_1[0->10]
                  IdentifierWord_1[0->8]
                    IdentifierWord_1[0->5]
                      IdentifierWord_1[0->1]
                        IdentifierWord[0->1, "a"]
                      IdentifierWord[2->5, "car"]
                    IdentifierWord[6->8, "is"]
                  IdentifierWord[9->10, "a"]
                IdentifierWord[11->15, "kind"]
              IdentifierWord[16->18, "of"]
            IdentifierWord[19->26, "vehicle"]
          IdentifierWord[27->31, "with"]
        IdentifierWord[32->38, "wheels"]
    DOT[38->39, "."]
'''
    assert '\n\n'.join([r.tree_str()
                        for r in results]).strip() == expected.strip()
def test_nops():
    """
    Test that nops (no prefer shifts) is honored per rule.
    """
    grammar = """
    Program: "begin" statements=Statements ProgramEnd EOF;
    Statements: Statements1 | EMPTY;
    Statements1: Statements1 Statement | Statement;
    ProgramEnd: End;
    Statement: End "transaction" | "command";

    terminals
    End: "end";
    """
    g = Grammar.from_string(grammar, ignore_case=True)
    parser = GLRParser(g, build_tree=True, prefer_shifts=True)

    # Here we have "end transaction", which is a statement, and "end",
    # which finishes the program. The prefer-shifts strategy will make the
    # parser always choose to shift "end" in anticipation of an
    # "end transaction" statement instead of reducing by "Statements"
    # and finishing.
    with pytest.raises(ParseError):
        parser.parse("""
        begin
            command
            end transaction
            command
            end transaction
            command
        end
        """)

    # When {nops} is used, the GLR parser will investigate both
    # possibilities at this place and find the correct interpretation,
    # while still using the prefer-shifts strategy globally.
    grammar = """
    Program: "begin" statements=Statements ProgramEnd EOF;
    Statements: Statements1 {nops} | EMPTY;
    Statements1: Statements1 Statement | Statement;
    ProgramEnd: End;
    Statement: End "transaction" | "command";

    terminals
    End: "end";
    """
    g = Grammar.from_string(grammar, ignore_case=True)
    parser = GLRParser(g, build_tree=True, prefer_shifts=True)
    parser.parse("""
    begin
        command
        end transaction
        command
        end transaction
        command
    end
    """)
def test_lexical_ambiguity2():
    g = Grammar.from_string(r'''
    Stuff: Stuff "+" Stuff | Something;
    Something: INT | FLOAT | Object;
    Object: INT DOT INT;

    terminals
    INT: /\d+/;
    FLOAT: /\d+(\.\d+)?/;
    DOT: ".";
    ''')

    parser = GLRParser(g)

    # Lexical ambiguity between FLOAT and INT DOT INT.
    forest = parser.parse('42.12')
    assert len(forest) == 2
    assert forest.ambiguities == 1

    # Here we have two such ambiguities.
    forest = parser.parse('42.12 + 3.8')
    assert len(forest) == 4
    assert forest.ambiguities == 2

    # Here we have 3 lexical ambiguities and 1 ambiguity for the
    # + operations.
    forest = parser.parse('34.78 + 8 + 3.3')
    assert len(forest) == 16
    assert forest.ambiguities == 4

    # Here we have 4 lexical ambiguities and 3 ambiguities for the
    # + operations, therefore 5 * 2 ^ 4 solutions.
    forest = parser.parse('34.78 + 8 + 3.3 + 1.2')
    assert len(forest) == 80
    assert forest.ambiguities == 7

    # When default lexical disambiguation is activated, we should be left
    # only with the ambiguities it cannot resolve.
    parser = GLRParser(g, lexical_disambiguation=True)

    # Longest match is used to choose FLOAT.
    forest = parser.parse('42.12')
    assert len(forest) == 1
    forest[0].symbol.name == 'FLOAT'
    assert forest.ambiguities == 0

    # Also, longest match will choose FLOAT in both cases.
    forest = parser.parse('42.12 + 3.8')
    assert len(forest) == 1
    assert forest.ambiguities == 0

    # Here we still have the lexical ambiguity on "8".
    forest = parser.parse('34.78 + 8 + 3.3')
    assert len(forest) == 4
    assert forest.ambiguities == 2

    # Lexical ambiguity on "8" and 3 syntactical ambiguities on the
    # + operations.
    forest = parser.parse('34.78 + 8 + 3.3 + 1.2')
    assert len(forest) == 10
    assert forest.ambiguities == 4
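# A sketch of the counting used in the comments above, not part of the
# original suite: the forest size is the number of syntactic parses (the
# Catalan number of the "+" count) times 2^k, where k is the number of
# independently ambiguous number tokens.
def example_solution_count_formula():
    from math import comb

    def catalan(n):
        return comb(2 * n, n) // (n + 1)

    assert catalan(2) * 2 ** 3 == 16  # '34.78 + 8 + 3.3'
    assert catalan(3) * 2 ** 4 == 80  # '34.78 + 8 + 3.3 + 1.2'
    # With lexical disambiguation only "8" stays ambiguous:
    assert catalan(3) * 2 ** 1 == 10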