Ejemplo n.º 1
0
 def test_04_non_begin_anchor(self):
     """Check the ``^^`` anchor: match words that are NOT at the beginning of a line.

     The patterns are written as raw strings so that ``\\w``, ``\\s`` and ``\\S``
     reach the scanner as regex escapes instead of relying on Python passing
     unrecognized string escapes through (deprecated; warns on modern Python).
     """
     s = miniscan.Definition()
     # Yield only those words found NOT at the beginning of lines.
     s.token('word', r'^^\w+')
     s.ignore(r'\s+')  # Skip spaces
     s.ignore(r'\S+')  # Skip other sequences of non-spaces.
     # The first word of each line ('apple', 'animal') is absent from the expectation.
     self.semantics(
         ['banana', 'orange', 'vegetable', 'mineral'],
         s,
         'apple banana orange\nanimal vegetable mineral',
     )
Ejemplo n.º 2
0
 def test_09_forgotten_action(self):
     """Defining a pattern with ``.on(...)`` but never attaching an action must be caught.

     The mistake is expected to surface as an AssertionError at the next
     attempt to define a pattern.
     """
     definition = miniscan.Definition()
     # Match ernie, but only at the end.
     definition.token('ernie', 'ernie$')
     # Match bert, but only if " and" follows. The result of `.on(...)` is
     # deliberately dropped here, i.e. no action is ever provided for it.
     pattern_without_action = r'bert/\s+and'
     definition.on(pattern_without_action)
     # The next pattern definition is where the omission should blow up.
     with self.assertRaises(AssertionError):
         definition.on('.')
Ejemplo n.º 3
0
 def test_06_simple_trailing_context(self):
     """Check simple trailing context: ``stem/trail`` yields only the stem.

     The patterns are raw strings so the regex escapes (``\\w``, ``\\s``, ``\\S``)
     are not treated as (invalid) Python string escapes.
     """
     s = miniscan.Definition()
     # Yield the stems of gerunds. Sort of. "Thing" is not a gerund.
     s.token('stem', r'\w+/ing')
     s.ignore(r'\w+')  # Skip words not matched above
     s.ignore(r'\s+')  # Skip spaces
     s.ignore(r'\S')  # Skip non-spaces, one at a time.
     self.semantics(
         ['eat', 'drink'],
         s,
         'There was eating, drinking, and merriment all around.',
     )
Ejemplo n.º 4
0
 def test_10_charclass_intersection(self):
     """ Exercise the canonical "consonants" example.

     A consonant is defined as the intersection (``&&``) of ``{alpha}`` with
     the negation (``^``) of the ``{vowel}`` class defined just before it.
     """
     definition = miniscan.Definition()
     definition.let('vowel', r'[AEIOUaeiou]')
     definition.let('consonant', r'[{alpha}&&^{vowel}]')
     # Runs of consonants become tokens; every other character is dropped.
     definition.token('consonant', '{consonant}+')
     definition.ignore('{ANY}')

     expect = 'T-s-t-n-s-l-mn-s-l-nc-n-d-ll-d-rk-d-ck'
     original_text = 'To sit in solemn silence on a dull dark dock,'
     # Item [1] of each scanned token is its value; join them for easy comparison.
     consonant_runs = [token[1] for token in definition.scan(original_text)]
     result = '-'.join(consonant_runs)
     self.assertEqual(expect, result)
Ejemplo n.º 5
0
 def test_07_variable_trail_on_fixed_stem(self):
     """Check trailing context where the stem is fixed and the trail is an alternation.

     The ignore patterns are raw strings so ``\\s`` and ``\\S`` are regex escapes
     rather than (invalid) Python string escapes.
     """
     s = miniscan.Definition()
     s.token('stem', 'eat/ing|en|s')  # Yield the stems of eat-forms
     s.ignore(r'\s+')  # Skip spaces
     s.ignore(r'\S')  # Skip non-spaces, one at a time.
     # Exactly one stem is expected even though "eat" occurs twice in the text;
     # presumably the final "eat." has none of the listed trails -- confirm
     # against the scanner if this expectation ever changes.
     self.semantics(
         ['eat'],
         s,
         'There was eating, drinking, and merriment all around, but the man did not eat.',
     )
Ejemplo n.º 6
0
 def test_05_eol_anchor(self):
     """Check the ``$`` anchor against all three common line-ending conventions.

     The patterns are raw strings so the regex escapes are not treated as
     (invalid) Python string escapes. The sample texts are deliberately NOT
     raw: their ``\\n`` / ``\\r`` must be real line-ending characters.
     """
     s = miniscan.Definition()
     # Yield only those words found at the ends of lines.
     # Note that the end-of-text also counts as an end-of-line zone; this is NOT strictly looking for \n.
     s.token('work', r'\w+$')
     s.ignore(r'\s+')  # Skip spaces
     s.ignore(r'\S+')  # Skip other sequences of non-spaces.
     expect = ['orange', 'mineral']
     texts = (
         'apple banana orange\nanimal vegetable mineral',  # Unix-style
         'apple banana orange\ranimal vegetable mineral',  # Apple-Classic Style
         'apple banana orange\r\nanimal vegetable mineral',  # Dos-style
     )
     # Same definition, same expectation, one check per line-ending style.
     for text in texts:
         self.semantics(expect, s, text)
Ejemplo n.º 7
0
 def test_03_begin_anchor(self):
     """Check the ``^`` anchor against all three common line-ending conventions.

     The patterns are raw strings so the regex escapes are not treated as
     (invalid) Python string escapes. The sample texts are deliberately NOT
     raw: their ``\\n`` / ``\\r`` must be real line-ending characters.
     """
     s = miniscan.Definition()
     # Yield only those words found at the beginning of lines.
     s.token('word', r'^\w+')
     # Skip all other characters, one character at a time.
     s.ignore(r'[\s\S]')
     expect = ['apple', 'animal']
     texts = (
         'apple banana orange\nanimal vegetable mineral',  # Unix-style
         'apple banana orange\ranimal vegetable mineral',  # Apple-Classic Style
         'apple banana orange\r\nanimal vegetable mineral',  # Dos-style
     )
     # Same definition, same expectation, one check per line-ending style.
     for text in texts:
         self.semantics(expect, s, text)
Ejemplo n.º 8
0
    def test_01_simple_tokens_with_rank_feature(self):
        """Check plain tokens plus the ``rank`` feature for overlapping patterns.

        The patterns are raw strings so ``\\s``, ``\\w`` and ``\\d`` are regex
        escapes rather than (invalid) Python string escapes.
        """
        s = miniscan.Definition()
        # Ignore spaces except inasmuch as they separate tokens.
        s.ignore(r'\s+')
        # The digits are included in the \w shorthand,
        s.token('word', r'\w+')
        # but the higher rank (than default zero) makes numbers stand out.
        # Matched text is passed through `int`, so number values are integers.
        s.token_map('number', r'\d+', int, rank=1)

        self.assertEqual(
            [
                ('word', 'abc'),
                ('number', 123),
                ('word', 'def456'),
                ('number', 789),
                ('word', 'XYZ'),
            ],
            list(s.scan(' abc   123  def456  789XYZ ')),
        )
Ejemplo n.º 9
0
""" JSON is JavaScript Object Notation. See http://www.json.org/ for more.
Python has a standard library for JSON, so this is just a worked example. """

from boozetools.parsing import miniparse
from boozetools.scanning import miniscan
from boozetools.support.interfaces import Scanner

###################################################################################
#  Begin with a scanner definition:
###################################################################################

# Define a scanner.
# `lexemes` collects the named subexpressions below and the pattern/action
# pairs attached to it further down in this module.
lexemes = miniscan.Definition()

# A few named subexpressions make the rest considerably easier to read (and write).
# A name introduced with `.let(...)` is referenced from a later pattern by
# wrapping it in braces, as `{wholeNumber}` is inside `signedInteger`.
# (The per-line glosses read the patterns with conventional regex semantics.)
lexemes.let('wholeNumber', r'[1-9]\d*')  # a nonzero digit, then any further digits
lexemes.let('signedInteger', r'-?(0|{wholeNumber})')  # optional minus, then 0 or a whole number
lexemes.let('fractionalPart', r'\.\d+')  # a literal dot, then one or more digits
lexemes.let('exponent', r'[Ee][-+]?\d+')  # E or e, optional sign, then digits


# Now we can write some pattern/action pairs.
# The miniscan module offers several ways.
# One way is as a decorator for an arbitrary function.
# This is convenient if significant computation determines which token
# (or indeed, how many tokens) to emit.
@lexemes.on('{signedInteger}')
def match_integer(yy: Scanner):
    # It's sort of assumed you'll be connecting a mini-scanner up to a mini-parser.
    # The parser module expects to get (token, value, start, end) quads, but the
    # scanner handles the start and end. You just call the `.token(...)` method
Ejemplo n.º 10
0
 def test_08_trailing_context_gets_put_back(self):
     """Trailing context must be returned to the input after a match.

     With input '123', the digit that served as trailing context for '1' is
     expected to be scanned again as the stem '2'.
     """
     definition = miniscan.Definition()
     # A digit is a stem only when another digit follows it.
     definition.token('stem', r'\d/\d')
     # Anything else is skipped.
     definition.ignore(r'.')
     expect = ['1', '2']
     self.semantics(expect, definition, '123')