def test_04_non_begin_anchor(self):
    s = miniscan.Definition()
    s.token('word', r'^^\w+')  # Yield only those words found NOT at the beginning of lines.
    s.ignore(r'\s+')  # Skip spaces.
    s.ignore(r'\S+')  # Skip other sequences of non-spaces.
    self.semantics(['banana', 'orange', 'vegetable', 'mineral'], s, 'apple banana orange\nanimal vegetable mineral')

def test_09_forgotten_action(self):
    s = miniscan.Definition()
    s.token('ernie', 'ernie$')  # Match ernie, but only at the end.
    s.on(r'bert/\s+and')  # Match bert, but only if " and" follows. However, forget to provide an action,
    with self.assertRaises(AssertionError):
        s.on('.')  # triggering an exception at the next attempt to define a pattern.

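# For contrast with test_09 above: `.on(...)` hands back a decorator, so the
# intended usage attaches an action function, as the JSON example further down
# demonstrates with @lexemes.on(...). A minimal sketch (the token kind 'bert'
# and the matched-text accessor name are illustrative assumptions, not
# confirmed by this excerpt):
#
#     @s.on(r'bert/\s+and')
#     def match_bert(yy):
#         yy.token('bert', yy.matched_text())  # accessor name assumed
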
def test_06_simple_trailing_context(self):
    s = miniscan.Definition()
    s.token('stem', r'\w+/ing')  # Yield the stems of gerunds. Sort of. "Thing" is not a gerund.
    s.ignore(r'\w+')  # Skip words not matched above.
    s.ignore(r'\s+')  # Skip spaces.
    s.ignore(r'\S')  # Skip non-spaces, one at a time.
    self.semantics(['eat', 'drink'], s, 'There was eating, drinking, and merriment all around.')

def test_10_charclass_intersection(self):
    """ Exercise the canonical "consonants" example. """
    s = miniscan.Definition()
    s.let('vowel', r'[AEIOUaeiou]')
    s.let('consonant', r'[{alpha}&&^{vowel}]')
    s.token('consonant', '{consonant}+')
    s.ignore('{ANY}')
    original_text = 'To sit in solemn silence on a dull dark dock,'
    result = '-'.join(t[1] for t in s.scan(original_text))
    expect = 'T-s-t-n-s-l-mn-s-l-nc-n-d-ll-d-rk-d-ck'
    self.assertEqual(expect, result)

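# An aside on test_10: `{alpha}` and `{ANY}` read as predefined named classes,
# and `[{alpha}&&^{vowel}]` intersects alpha with the complement of vowel,
# i.e. "alphabetic AND NOT vowel". A hedged equivalent spelled out without the
# intersection operator, assuming {alpha} covers only ASCII letters:
#
#     s.let('consonant', r'[B-DF-HJ-NP-TV-Zb-df-hj-np-tv-z]')
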
def test_07_variable_trail_on_fixed_stem(self):
    s = miniscan.Definition()
    s.token('stem', 'eat/ing|en|s')  # Yield the stems of eat-forms.
    s.ignore(r'\s+')  # Skip spaces.
    s.ignore(r'\S')  # Skip non-spaces, one at a time.
    self.semantics(['eat'], s, 'There was eating, drinking, and merriment all around, but the man did not eat.')

def test_05_eol_anchor(self):
    s = miniscan.Definition()
    s.token('word', r'\w+$')  # Yield only those words found at the ends of lines.
    # Note that the end-of-text also counts as an end-of-line zone; this is NOT strictly looking for \n.
    s.ignore(r'\s+')  # Skip spaces.
    s.ignore(r'\S+')  # Skip other sequences of non-spaces.
    expect = ['orange', 'mineral']
    self.semantics(expect, s, 'apple banana orange\nanimal vegetable mineral')  # Unix-style
    self.semantics(expect, s, 'apple banana orange\ranimal vegetable mineral')  # Apple-Classic style
    self.semantics(expect, s, 'apple banana orange\r\nanimal vegetable mineral')  # DOS-style

def test_03_begin_anchor(self):
    s = miniscan.Definition()
    s.token('word', r'^\w+')  # Yield only those words found at the beginning of lines.
    s.ignore(r'[\s\S]')  # Skip all other characters, one character at a time.
    expect = ['apple', 'animal']
    self.semantics(expect, s, 'apple banana orange\nanimal vegetable mineral')  # Unix-style
    self.semantics(expect, s, 'apple banana orange\ranimal vegetable mineral')  # Apple-Classic style
    self.semantics(expect, s, 'apple banana orange\r\nanimal vegetable mineral')  # DOS-style

def test_01_simple_tokens_with_rank_feature(self):
    s = miniscan.Definition()
    s.ignore(r'\s+')  # Ignore spaces except inasmuch as they separate tokens.
    s.token('word', r'\w+')  # The digits are included in the \w shorthand,
    s.token_map('number', r'\d+', int, rank=1)  # but the higher rank (than the default of zero) makes numbers stand out.
    self.assertEqual(
        [('word', 'abc'), ('number', 123), ('word', 'def456'), ('number', 789), ('word', 'XYZ')],
        list(s.scan(' abc 123 def456 789XYZ ')),
    )

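# A hedged aside on rank: without rank=1, '123' matches both rules at the same
# length, so the winner falls to the library's tie-break (commonly the earlier
# declaration, which here would be 'word'). Raising the rank makes 'number'
# beat 'word' on digit runs even though \w covers digits too:
#
#     s2 = miniscan.Definition()
#     s2.ignore(r'\s+')
#     s2.token('word', r'\w+')
#     s2.token_map('number', r'\d+', int)  # no rank: ties likely go to 'word'
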
""" JSON is JavaScript Object Notation. See http://www.json.org/ for more. Python has a standard library for JSON, so this is just a worked example. """ from boozetools.parsing import miniparse from boozetools.scanning import miniscan from boozetools.support.interfaces import Scanner ################################################################################### # Begin with a scanner definition: ################################################################################### # Define a scanner. lexemes = miniscan.Definition() # A few named subexpressions make the rest considerably easier to read (and write). lexemes.let('wholeNumber', r'[1-9]\d*') lexemes.let('signedInteger', r'-?(0|{wholeNumber})') lexemes.let('fractionalPart', r'\.\d+') lexemes.let('exponent', r'[Ee][-+]?\d+') # Now we can write some pattern/action pairs. # The miniscan module offers several ways. # One way is as a decorator for an arbitrary function: # This is convenient if significant computation determines which token # (or indeed, how many tokens) to emit. @lexemes.on('{signedInteger}') def match_integer(yy: Scanner): # It's sort of assumed you'll be connecting a mini-scanner up to a mini-parser. # The parser module expects to get (token, value, start, end) quads, but the # scanner handles the start and end. You just call the `.token(...)` method
def test_08_trailing_context_gets_put_back(self):
    s = miniscan.Definition()
    s.token('stem', r'\d/\d')
    s.ignore(r'.')
    expect = list('12')
    self.semantics(expect, s, '123')

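# A closing note on test_08: the character consumed as trailing context is put
# back and rescanned. In '123', the '2' first serves as context for the match
# on '1', then becomes a stem in its own right with '3' as its context. Spelled
# out directly, using the (kind, value) pairs that s.scan(...) yields in
# test_10 above:
#
#     assert [t[1] for t in s.scan('123')] == ['1', '2']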