def endLine():
    """Build the end-of-line rule.

    The pattern wraps ``lineSep`` in an optional group, so the regex also
    matches the empty string.
    """
    # match - Line terminator (windows and unix style)
    template = r'({})?'
    return RegExMatch(template.format(lineSep))
def arg_identifier():
    """Build the rule for an argument identifier.

    The pattern is a letter or underscore followed by any number of
    letters, underscores, or digits.
    """
    pattern = r"[a-zA-Z_]([a-zA-Z_]|[0-9])*"
    return RegExMatch(pattern)
def comment():
    """Build the rule for a ``//`` comment.

    Returns a two-element tuple: the literal ``//`` and a regex rule for
    the rest of the line up to and including the newline character.
    """
    rest_of_line = RegExMatch(".*\n")
    return "//", rest_of_line
def roman():
    """Build the rule for a Roman numeral with an optional letter suffix.

    The numeral part covers I through IX; it may be followed by an
    optional ``A``/``a`` and then an optional ``B``/``b``.
    """
    numeral_pattern = r'(I[VX]|VI{0,3}|I{1,3})([Aa]?[Bb]?)'
    return RegExMatch(numeral_pattern)


def prefix():
    """Return the literal prefix alternatives as a new list, in fixed order."""
    alternatives = ('esd', 'sd', 'd', 'g', 'c')
    return list(alternatives)
def identifier():
    """Build the rule for one or more lower-case-initial identifiers.

    Each identifier starts with ``a``-``z`` and continues with letters,
    digits, or underscores; the regex rule is wrapped in ``OneOrMore``.
    """
    single_identifier = RegExMatch('[a-z][a-zA-Z_0-9]*')
    return OneOrMore(single_identifier)
def til_eol():
    """Build the rule matching any run of characters matched by ``.*``."""
    anything = r".*"
    return RegExMatch(anything)
def number():
    """Build the rule for a signed decimal number.

    The pattern accepts an optional sign, then either digits with an
    optional fraction or a fraction starting with a dot, then an optional
    exponent (``e``/``E`` with its own optional sign).
    """
    pattern = r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?'
    return RegExMatch(pattern)
def package():
    """Build the rule for a ``package`` declaration.

    Returns a tuple of the declaration regex (quotes and the trailing
    semicolon are optional) followed by one or more ``endLine`` rules.
    """
    # match - package ('")PACKAGE('")(;)
    declaration = RegExMatch(r'package\s?[\'\"]?.*[\'\"]?\;?')
    return declaration, OneOrMore(endLine)
def pkgImports():
    """Build the rule for an ``import`` statement.

    Returns a tuple of the import regex (quoted target, optional trailing
    semicolon) followed by one or more ``endLine`` rules.
    """
    # match - import ['"]PACKAGE['"](;)
    statement = RegExMatch(r'import\s?[\'\"].*[\'\"]\;?')
    return statement, OneOrMore(endLine)
def commentLine():
    """Build the rule for a ``//`` line comment.

    Returns a tuple of the literal ``//`` and a regex rule for the text
    that follows it.
    """
    # match - any character, non line terminator
    comment_text = RegExMatch(r'.*')
    return '//', comment_text
def syntax():
    """Build the rule for a ``syntax = "..."`` statement.

    Returns a tuple of the statement regex (quoted value, optional
    trailing semicolon) followed by one or more ``endLine`` rules.
    """
    # match - syntax = ['"]SYNTAX['"](;)
    statement = RegExMatch(r'syntax\s?=\s?[\'\"].*[\'\"]\;?')
    return statement, OneOrMore(endLine)
def commentBlock():
    """Build the rule for a ``/* ... */`` block comment.

    The body is matched lazily and may contain any character or the
    ``lineSep`` pattern.  The rule is returned inside a one-element tuple.
    """
    # match - any characters (line terminators included) enclosed with
    # block quote signifier (/* and */)
    template = r'\/\*(.|{})*?\*\/'
    block_rule = RegExMatch(template.format(lineSep))
    return (block_rule,)
def string():
    """Build the rule for a quoted string.

    The pattern is a quote character, a lazy run of any characters, and a
    closing quote character; it does not require the opening and closing
    quotes to be of the same kind.
    """
    # match - any characters enclosed with single/double quotes
    quoted = r'[\'\"].*?[\'\"]'
    return RegExMatch(quoted)
def number():
    """Build the rule for an unsigned number, with or without decimals.

    Accepted forms are ``12.5``, ``12.``, ``.5`` and ``12``.  At least one
    digit is always required: the earlier pattern ``\\d*\\.\\d*`` also
    accepted a lone ``.`` as a number, which is not a numeric literal.
    """
    # match - numbers with and without decimals
    return RegExMatch(r'\d+\.\d*|\.\d+|\d+')
def identifier():
    """Build the rule for an identifier.

    The first character is a letter or underscore; the remaining characters
    may be letters, digits, underscores, apostrophes, or ``#``.
    """
    pattern = r"[_A-Za-z][A-Za-z0-9_'#]*"
    return RegExMatch(pattern)
def grammar():
    """Build the top-level rule.

    Returns a tuple of the literals ``one`` and ``two``, a regex rule for
    one or more digits, and the literal ``three``.
    """
    digits = RegExMatch(r'\d+')
    return "one", "two", digits, "three"
def breakline():
    """Build the rule for a dashed break line.

    Returns a tuple of the literal ``--``, a regex rule for any further
    dashes, and a newline literal.
    """
    extra_dashes = RegExMatch(r"-*")
    return "--", extra_dashes, "\n"
def scenarios(rule, s):  # -> grammar, text, expect
    """Yield parse scenarios that place the text ``s`` at various positions.

    Every scenario is a 4-tuple ``(name, grammar, text, expect)``: the
    scenario name, the grammar to parse with, the input text, and the tuple
    of terminals the parse is expected to produce.

    Args:
        rule: Grammar rule under test; a deep copy of it is used here.
        s: Text that ``rule`` is expected to match.  It must not contain
            ``FAKE_SPACE``.

    Yields:
        One 4-tuple per scenario, in this order: ``s`` by itself, at the
        start of a phrase (once, twice, and with a trailing newline),
        between two phrases, at the end of a phrase, and several times
        within a paragraph.
    """
    assert FAKE_SPACE not in s, "INTERNAL ERROR, chosen 'fake space' in string to be tested !"

    rule = deepcopy(rule)

    catchall = RegExMatch(r'.*', rule_name='catch_all')
    newline = RegExMatch(r'[\n]', rule_name='newline')

    t_eof = Terminal(EOF(), 0, '')  # not referenced by any scenario below
    t_s = Terminal(rule, 0, s)

    def grammar(_words):
        # Try the rule under test first, then plain words, then the
        # catch-all and newline rules.
        body = OneOrMore(OrderedChoice([rule, _words, catchall, newline]))
        return Sequence((body, EOF))

    def itself():
        name = "by_itself"
        return name, Sequence((rule, EOF)), s, (t_s, )

    def at_start():
        name = 'at_start_followed_by_phrase'
        # fake_spaces_etc presumably rewrites characters of the phrase that
        # could collide with 's' -- confirm against the helper.
        phrase = fake_spaces_etc(s, 'now is the time')
        assert s not in phrase
        _words = get_words(s)
        text = ''.join([s, phrase])
        expect = (t_s, Terminal(_words(), 0, phrase))
        return name, grammar(_words), text, expect

    def at_start_twice():
        name = 'at_start_followed_by_phrase_twice'
        # 's' at start followed by a phrase, TWICE
        phrase = fake_spaces_etc(s, 'now is the time')
        assert s not in phrase
        _words = get_words(s)
        text = ''.join([*((s, phrase) * 2)])
        t_phrase = Terminal(_words(), 0, phrase)
        expect = (*((t_s, t_phrase) * 2), )
        return name, grammar(_words), text, expect

    def at_start_two_lines():
        name = 'at_start_followed_by_phrase_two_lines'
        phrase = fake_spaces_etc(s, 'now is the time' + '\n')
        assert s not in phrase
        _words = get_words(s)
        text = ''.join([*((s, phrase) * 2)])
        t_phrase = Terminal(_words(), 0, phrase)
        expect = (*((t_s, t_phrase) * 2), )
        return name, grammar(_words), text, expect

    # !@#
    def in_the_middle():
        name = 'in_the_middle_between_two_phrases'
        left_phrase = fake_spaces_etc(s, 'for all good men')
        right_phrase = fake_spaces_etc(s, 'to rise up')
        assert s not in left_phrase
        assert s not in right_phrase
        _words = get_words(s)
        text = ''.join([left_phrase, s, right_phrase])
        t_left = Terminal(_words(), 0, left_phrase)
        t_right = Terminal(_words(), 0, right_phrase)
        expect = (t_left, t_s, t_right)
        return name, grammar(_words), text, expect

    def at_end():
        name = 'at_end_preceeded_by_a_phrase'
        phrase = fake_spaces_etc(s, 'in defense of freedom')
        assert s not in phrase
        _words = get_words(s)
        t_phrase = Terminal(_words(), 0, phrase)
        # The phrase comes first so that 's' really sits at the end of the
        # text; previously this built the same input as 'at_start'.
        text = ''.join([phrase, s])
        expect = (
            t_phrase,
            t_s,
        )
        return name, grammar(_words), text, expect

    #--------------------------------------------------------------------------

    def paragraph():
        name = 'several_occurances_in_a_paragraph'
        text = """<s> <s>The essence of America — that which really unites us — <s>is not ethnicity, <s>or<s>nationality, or religion. It is an <s> idea—and what an <s> idea it is : that you can come <s><s> from humble circumstances and do great things.<s> - Condoleezza Rice <s>"""
        # zero length phrases at start, end and one more in the middle
        n_empty = 3

        # Protect the '<s>' markers with a BEL character while the rest of
        # the text goes through fake_spaces_etc, then restore them.
        text = text.replace('<s>', chr(7))
        text = fake_spaces_etc(s, text)
        text = text.replace(chr(7), '<s>')
        assert s not in text

        _words = get_words(s)

        phrases = re.split('<s>', text)
        assert len(phrases[0]) == 0
        assert len(phrases[-1]) == 0

        t_s = Terminal(rule, 0, s)
        tw = lambda p: Terminal(_words(), 0, p)
        # Each phrase contributes its own terminal (nothing when empty)
        # followed by a terminal for 's'.
        terms = [((tw(p) if len(p) > 0 else ()), t_s) for p in phrases]
        terms = flatten(terms)
        # The last phrase is not followed by 's'.
        del terms[-1]
        assert len(terms) == 2 * len(phrases) - n_empty - 1

        # Handle the simplest Zero/One Or Many rules on a character class:
        # collapse any series of 't_s' elements into a single t_s element.
        # NOTE: this was once guarded by
        #   isinstance(rule, RegExMatch) and rule.to_match[-1] in '*+'
        # and now runs unconditionally.
        limit = len(terms) - 1
        idx = 0
        while idx < limit:
            if (terms[idx].rule_name == t_s.rule_name and
                    terms[idx + 1].rule_name == t_s.rule_name):
                value = terms[idx].value + terms[idx + 1].value
                terms[idx] = Terminal(rule, 0, value)
                del terms[idx + 1]
                limit -= 1
            else:
                idx += 1

        return name, grammar(_words), s.join(phrases), tuple(terms)

    #--------------------------------------------------------------------------
    # !@#
    tests = [
        itself,
        at_start,
        at_start_twice,
        at_start_two_lines,
        in_the_middle,
        at_end,
        paragraph,
    ]

    for test in tests:
        yield test()
def ident():
    """Build the rule for an identifier made of one or more word characters."""
    word_chars = r"\w+"
    return RegExMatch(word_chars)
def words():
    """Build the rule named ``words``.

    The pattern is ``WORDS_CHARACTER_REGEX_WITH_LF`` repeated one or more
    times.
    """
    pattern = WORDS_CHARACTER_REGEX_WITH_LF + '+'
    return RegExMatch(pattern, rule_name='words')
def numeric():
    """Build the rule for an unsigned number.

    The pattern is one or more digits, optionally followed by a dot and
    any number of further digits.
    """
    pattern = r'[0-9]+(\.[0-9]*)?'
    return RegExMatch(pattern)


def roman():
    """Build the rule for a Roman numeral with an optional letter suffix.

    The numeral part covers I through IX; it may be followed by an
    optional ``A``/``a`` and then an optional ``B``/``b``.
    """
    numeral_pattern = r'(I[VX]|VI{0,3}|I{1,3})([Aa]?[Bb]?)'
    return RegExMatch(numeral_pattern)
def value():
    """Build the rule for a value: one or more letters, digits, or underscores."""
    pattern = r"[A-Za-z0-9_]+"
    return RegExMatch(pattern)
def integer():
    """Build the rule for an optionally negated number.

    Returns a tuple of an optional ``-`` sign and a list of two regex
    alternatives: digits with a dot and a single following digit, or plain
    digits.

    The patterns are raw strings so that ``\\d`` and ``\\.`` reach the regex
    engine without relying on Python's handling of unrecognised escape
    sequences, which emits a warning on current interpreters.
    """
    return Optional('-'), [RegExMatch(r'\d*\.\d'), RegExMatch(r'\d+')]
def num():
    """Build the rule for a run of one or more ASCII digits."""
    digits = r"[0-9]+"
    return RegExMatch(digits)
def comment():
    """Build the rule for a ``#`` comment: ``#`` and whatever ``.*`` matches after it."""
    pattern = '#.*'
    return RegExMatch(pattern)
def wsl():
    """Build the rule for optional whitespace (zero or more ``\\s`` characters)."""
    any_whitespace = r"\s*"
    return RegExMatch(any_whitespace)
def str_match():
    """Build the rule for a single- or double-quoted string literal.

    Inside the quotes, a backslash escapes the following character, so an
    escaped quote does not end the string.  The ``(?s)`` flag lets ``.``
    match newlines as well.
    """
    single_quoted = r'''(?s)('[^'\\]*(?:\\.[^'\\]*)*')|'''
    double_quoted = r'''("[^"\\]*(?:\\.[^"\\]*)*")'''
    return RegExMatch(single_quoted + double_quoted)
def ws():
    """Build the rule for horizontal whitespace: one or more tabs or spaces."""
    blanks = r"[\t ]+"
    return RegExMatch(blanks)
def open_brace() -> GrammarType:
    """Build the rule for an opening brace, allowing leading whitespace."""
    brace_pattern = r'\s*{'
    return RegExMatch(brace_pattern)
def wrappedDef():
    """Build the rule for a ``struct`` definition.

    Returns a tuple of the ``struct`` keyword, a regex rule for the struct
    name, an opening brace, an optional ``repeatedDef``, and a closing
    brace.
    """
    struct_name = RegExMatch(r'[\w\d]+')
    members = Optional(repeatedDef)
    return ('struct', struct_name, '{', members, '}')