Esempio n. 1
0
def test_limit():
    """Check that the `limit` option of cpppo.regex stops parsing after 10 input symbols.

    With a limit of 10 the machine only sees 'a' followed by 9 b's.  A regex requiring an even
    number of b's must therefore fail with cpppo.NonTerminal, while one requiring an odd number
    of b's must succeed and have collected exactly those 10 symbols.
    """
    step_fmt = "%s #%3d -> %10.10s; next byte %3d: %-10.10r: %r"

    # Even b's only: being stopped at a+b*9 leaves the machine in a non-terminal state.
    source = cpppo.peekable( str( 'a' + 'b' * 100 ))
    data = cpppo.dotdict()
    stopped_short = False
    try:
        with cpppo.regex( initial=str( 'a(bb)*' ), context='even_b', limit=10 ) as machine:
            for i, (m, s) in enumerate( machine.run( source=source, data=data )):
                log.info( step_fmt, m.name_centered(), i, s, source.sent, source.peek(), data )
    except cpppo.NonTerminal:
        stopped_short = True
    assert stopped_short, "Should have failed with a cpppo.NonTerminal exception"
    assert i == 10
    assert source.sent == 10

    # Odd b's are fine.  The same limit is supplied four ways: a plain int, a path string, a
    # callable returning a constant, and a callable looking the value up in `data`.
    def constant_ten( **kwds ):
        return 10

    def ten_from_data( path=None, data=None, **kwds ):
        return data[path + '..somewhere.ten']

    for limit in ( 10, '..somewhere.ten', constant_ten, ten_from_data ):
        source = cpppo.peekable( str( 'a' + 'b' * 100 ))
        data = cpppo.dotdict()
        data['somewhere.ten'] = 10
        with cpppo.regex( initial=str( 'ab(bb)*' ), context='odd_b', limit=limit ) as machine:
            for i, (m, s) in enumerate( machine.run( source=source, data=data )):
                log.info( step_fmt, m.name_centered(), i, s, source.sent, source.peek(), data )
            assert i == 10
            assert source.sent == 10
            # The collected array is converted with the method appropriate to the Python version
            collected = data.odd_b.input
            if sys.version_info[0] < 3:
                text = collected.tostring()
            else:
                text = collected.tounicode()
            assert text == str( 'a' + 'b' * 9 )
Esempio n. 2
0
def main():
    """Run the basic README examples: three parsers over the same input, printing every step.

    Builds a hand-wired DFA for ab+, a composite DFA that repeats ab+ while dropping ",[ ]*"
    separators, and an equivalent regex-derived machine.  Each is run over 'abbbb, ab'; every
    transition is printed, followed by what was accepted and what input remains.
    """
    # Basic DFA that accepts ab+
    start = cpppo.state( 'E' )
    got_a = cpppo.state_input( 'A' )
    got_b = cpppo.state_input( 'B', terminal=True )
    start['a'] = got_a
    got_a['b'] = got_b
    got_b['b'] = got_b

    basic = cpppo.dfa( 'ab+', initial=start, context='basic' )

    # Composite state machine accepting ab+, ignoring ,[ ]* separators
    ab_plus = cpppo.dfa( 'ab+', initial=start, terminal=True )
    separator = cpppo.state_drop( 'SEP' )
    ab_plus[','] = separator
    separator[' '] = separator
    separator[None] = ab_plus

    csv_machine = cpppo.dfa( 'CSV', initial=ab_plus, context='csv' )

    # A regular expression; the default dfa name is the regular expression itself.
    by_regex = cpppo.regex( initial='(ab+)((,[ ]*)(ab+))*', context='regex' )

    step_fmt = "%s #%3d; next byte %3d: %-10.10r: %r"
    data = cpppo.dotdict()
    for machine in ( basic, csv_machine, by_regex ):
        # default location of state_input data, below the machine's context
        path = machine.context() + '.input'
        source = cpppo.peekable( str( 'abbbb, ab' ))
        with machine:
            for i, (m, s) in enumerate( machine.run( source=source, data=data )):
                fields = ( m.name_centered(), i, source.sent, source.peek(), data.get( path ))
                print( step_fmt % fields )
        accepted = data.get( path )
        leftover = ''.join( source )
        print( "Accepted: %r; remaining: %r\n" % ( accepted, leftover ))
    print( "Final: %r" % data )
Esempio n. 3
0
def test_regex_demo():
    regex			= str( '(ab+)((,[ ]*)(ab+))*' )
    machine			= cpppo.regex( name=str( 'demo' ), initial=regex )
    data			= cpppo.dotdict()
    with machine:
        source			= cpppo.chainable( str( 'abbb, abb, ab' ))
        for i,(m,s) in enumerate( machine.run( source=source, data=data )):
            log.info( "%s #%3d -> %10.10s; next byte %3d: %-10.10r: %r", m.name_centered(),
                      i, s, source.sent, source.peek(), data )
        assert i == 14
        assert source.sent == 13

    regexstr, lego, machine, initial = cpppo.state_input.from_regex(
            regex, alphabet=cpppo.type_str_iter, encoder=None, 
            typecode=cpppo.type_str_array_symbol, context=None )
    assert str( lego ) == "ab+(, *ab+)*"
    assert str( machine ) == """\
Esempio n. 4
0
def main():
    """Demonstrate the README's basic parsers by running each over the text 'abbbb, ab'.

    Three machines are built (a hand-wired ab+ DFA, a composite DFA that skips ",[ ]*"
    separators between ab+ groups, and a regex-derived machine), all storing into one shared
    dotdict which is printed at the end.
    """

    def demonstrate(machine, data):
        """Run one machine over a fresh source, printing each step and the final outcome."""
        # default location of state_input data, below the machine's context
        path = machine.context() + '.input'
        source = cpppo.peekable(str('abbbb, ab'))
        with machine:
            for i, (m, s) in enumerate(machine.run(source=source, data=data)):
                print("%s #%3d; next byte %3d: %-10.10r: %r" % (
                    m.name_centered(), i, source.sent, source.peek(), data.get(path)))
        print("Accepted: %r; remaining: %r\n" % (data.get(path), ''.join(source)))

    # Basic DFA that accepts ab+
    entry = cpppo.state('E')
    after_a = cpppo.state_input('A')
    after_b = cpppo.state_input('B', terminal=True)
    entry['a'] = after_a
    after_a['b'] = after_b
    after_b['b'] = after_b
    basic_dfa = cpppo.dfa('ab+', initial=entry, context='basic')

    # Composite state machine accepting ab+, ignoring ,[ ]* separators
    group = cpppo.dfa('ab+', initial=entry, terminal=True)
    gap = cpppo.state_drop('SEP')
    group[','] = gap
    gap[' '] = gap
    gap[None] = group
    csv_dfa = cpppo.dfa('CSV', initial=group, context='csv')

    # A regular expression; the default dfa name is the regular expression itself.
    regex_dfa = cpppo.regex(initial='(ab+)((,[ ]*)(ab+))*', context='regex')

    results = cpppo.dotdict()
    for machine in (basic_dfa, csv_dfa, regex_dfa):
        demonstrate(machine, results)
    print("Final: %r" % results)
Esempio n. 5
0
def test_codecs():
    """Check Unicode regexes against both Unicode-symbol and UTF-8 byte-symbol machines.

    Each sample text is run through each regex twice: once as Unicode symbols and once as
    UTF-8 encoded bytes.  A run counts as accepted when the machine ends terminal and has
    collected the whole input; that must agree with whether the text contains a 'π' (for the
    first regex) or contains none (for the second).
    """
    # In Python3, the greenery.fsm is able to handle the Unicode str type; under
    # Python2, it can sanely only handle the non-Unicode str type.
    if sys.version_info[0] < 3:
        return

    samples = (
        'pi: π',
        'abcdé\u4500123',
        'This contains π,π and more πs',
        'a 480Ω resistor',
    )
    # (regex, whether a text must contain π for the regex to accept all of it)
    checks = (
        ('[^π]*(π[^π]*)+', True),   # Optional non-π's, then at least one run of π and non-π's
        ('[^π]*[^π]', False),       # Any number of non-π, ending in a non-π
    )

    def utf8_symbols(s):
        """Yield the UTF-8 encoded bytes of `s`, one symbol at a time."""
        return (b for b in s.encode('utf-8'))

    # Pass 1: the unicode regex becomes a state machine over unicode symbols.  Only if both
    # the dfa and its sub-state are "terminal", will it be terminal.
    for text in samples:
        has_pi = 'π' in text
        for pattern, needs_pi in checks:
            expected = needs_pi == has_pi
            with cpppo.regex(name='pies', context="pies", initial=pattern,
                             terminal=True) as pies:
                source = cpppo.chainable(text)
                data = cpppo.dotdict()
                try:
                    for _machine, _state in pies.run(source=source, data=data):
                        pass
                except cpppo.NonTerminal:
                    pass
                # Greedy: runs 'til the end of input, collecting all of it unless it hits a
                # non-matching symbol.
                accepted = pies.terminal and data.pies.input.tounicode() == text
                verdict = "string accepted" if accepted else "string rejected"
                log.info("%s ends w/ re %s: %s: %r", pies.name_centered(), pattern,
                         verdict, data)
                assert accepted == expected

    # Pass 2: the same regexes as machines over byte symbols; the encoder generates 1 or more
    # bytes for each unicode symbol.
    for text in samples:
        encoded = text.encode('utf-8')
        has_pi = 'π' in text
        for pattern, needs_pi in checks:
            expected = needs_pi == has_pi
            source = cpppo.chainable(encoded)
            data = cpppo.dotdict()
            with cpppo.regex(name='pies', context="pies", initial=pattern, terminal=True,
                             regex_alphabet=int, regex_typecode='B',
                             regex_encoder=utf8_symbols) as pies:
                try:
                    for _machine, _state in pies.run(source=source, data=data):
                        pass
                except cpppo.NonTerminal:
                    pass
                accepted = pies.terminal and data.pies.input.tobytes() == encoded
                verdict = "string accepted" if accepted else "string rejected"
                log.detail("%s ends w/ re: %s: %s: %r", pies.name_centered(), pattern,
                           verdict, data)
                assert accepted == expected
                # Whatever was collected must be a prefix of the encoded input
                assert encoded.startswith(data.pies.input.tobytes())
Esempio n. 6
0
def test_regex():
    """Exercise cpppo.regex machines one transition at a time and over whole inputs.

    First the 'a*b.*x' machine is stepped through 'aaab1230xoxx', checking the next input
    symbol and the yielded state after every transition.  Then three greedy machines are run
    over a NUL-terminated input, checking how far each gets and what input it collects.
    """
    step_fmt = "%s #%3d -> %10.10s; next byte %3d: %-10.10r: %r"

    # Expected observations after each transition: (next input symbol, yielded state name,
    # source.sent).  A None state name means no next state; a None count means unchecked.
    # The initial state does *not* consume a source symbol, hence 'a' is seen four times.
    steps = [
        ('a', "0'", 0),
        ('a', "0", 0),
        ('a', "0", 1),
        ('a', "0", 2),
        ('b', "2", None),
        ('1', "2", None),
        ('2', "2", None),
        ('3', "2", None),
        ('0', "2", None),
        ('x', "3", None),
        ('o', "2", None),  # Trans. from term. to non-term. state!
        ('x', "3", None),
        ('x', "3", None),
        (None, None, None),
    ]

    # str() forces plain strings in 2.x, unicode in 3.x (counteracts unicode_literals)
    regex = str('a*b.*x')
    machine = cpppo.regex(name=str('test1'), initial=regex)
    with machine:
        source = cpppo.chainable(str('aaab1230xoxx'))
        sequence = machine.run(source=source)
        for num in range(20):
            try:
                mch, sta = next(sequence)
                inp = source.peek()
            except StopIteration:
                inp = source.peek()
                log.info("%s <- %-10.10r test done",
                         cpppo.centeraxis(mch, 25, clip=True), inp)
                break
            log.info("%s <- %-10.10r test rcvd",
                     cpppo.centeraxis(mch, 25, clip=True), inp)
            if sta is None:
                log.info("%s <- %-10.10r test no next state",
                         cpppo.centeraxis(mch, 25, clip=True), inp)
            if inp is None:
                log.info("%s <- %-10.10r test source finished",
                         cpppo.centeraxis(mch, 25, clip=True), inp)

            # The machine must not yield more transitions than expected
            assert num < len(steps)
            want_inp, want_state, want_sent = steps[num]
            if want_inp is None:
                assert inp is None
            else:
                assert inp == want_inp
            if want_state is None:
                assert sta is None
            else:
                assert sta.name == want_state
            if want_sent is not None:
                assert source.sent == want_sent
        assert inp is None
        assert num == len(steps)
        assert sta is None and machine.current.name == '3'

    # Whole-input runs over a NUL-terminated source.  Columns: machine name, regex, extra
    # cpppo.regex keywords, whether a NonTerminal exception is tolerated, expected truth of
    # machine.terminal (None: unchecked), last index yielded, symbols sent, input collected.
    greedy_cases = [
        ('dot', '.*', dict(terminal=True), True, True, 14, 13, 'aaab1230xoxx\x00'),
        ('not_xyz', '[^xyz]*', dict(), True, False, 9, 8, 'aaab1230'),
        ('not_NUL', '[^\x00]*', dict(), False, None, 13, 12, 'aaab1230xoxx'),
    ]
    for (name, pattern, extra, tolerate, want_terminal,
         want_last, want_sent, want_text) in greedy_cases:
        regex = str(pattern)
        machine = cpppo.regex(name=str(name), initial=regex, **extra)
        data = cpppo.dotdict()
        with machine:
            source = cpppo.chainable(str('aaab1230xoxx\0'))
            # Pre-initialised so an empty run fails the assertion below, not a NameError
            last = None
            try:
                for last, (m, s) in enumerate(machine.run(source=source, data=data)):
                    log.info(step_fmt, m.name_centered(), last, s, source.sent,
                             source.peek(), data)
            except cpppo.NonTerminal:
                if not tolerate:
                    raise
            if want_terminal is not None:
                assert bool(machine.terminal) is want_terminal
            assert last == want_last
            assert source.sent == want_sent
            collected = data.input.input
            if sys.version_info[0] < 3:
                assert collected.tostring() == want_text
            else:
                assert collected.tounicode() == want_text
Esempio n. 7
0
def test_codecs():
    """Run Unicode regexes over Unicode symbols, then over UTF-8 bytes, and compare outcomes.

    For every (text, regex) combination the machine is run to the end of its input or until it
    raises cpppo.NonTerminal.  The input is "accepted" when the machine is terminal and the
    collected input equals the original; this must match the expectation derived from whether
    the text contains 'π'.
    """
    # In Python3, the greenery.fsm is able to handle the Unicode str type; under
    # Python2, it can sanely only handle the non-Unicode str type.
    if sys.version_info[0] < 3:
        return

    texts = [
        'pi: π',
        'abcdé\u4500123',
        'This contains π,π and more πs',
        'a 480Ω resistor',
    ]
    tests = [
        # Optional non-π's, followed by at least one string of π and non-π's
        ('[^π]*(π[^π]*)+', True),
        # Any number of non-π, ending in a non-π
        ('[^π]*[^π]', False),
    ]
    # Every text paired with every test, texts varying slowest
    combinations = [(text, test) for text in texts for test in tests]

    def exhaust(machine, source, data):
        """Drive `machine` until input ends or it stops in a non-terminal state."""
        try:
            for _mch, _sta in machine.run(source=source, data=data):
                pass
        except cpppo.NonTerminal:
            pass

    # First, convert the unicode regex to a state machine in unicode symbols.  Only if both
    # the dfa and its sub-state are "terminal", will it be terminal.
    for text, (pattern, wants_pi) in combinations:
        with cpppo.regex(
                name='pies', context="pies", initial=pattern, terminal=True) as pies:
            original = text
            source = cpppo.chainable(original)
            data = cpppo.dotdict()
            exhaust(pies, source, data)
            accepted = pies.terminal and data.pies.input.tounicode() == original
            log.info("%s ends w/ re %s: %s: %r", pies.name_centered(), pattern,
                     "string accepted" if accepted else "string rejected", data)

            # Each regex is greedy, and so runs 'til the end of input (next state is None);
            # it collects the full input string, unless it runs into a non-matching input.
            expected = wants_pi == ('π' in text)
            assert accepted == expected

    # Then convert the unicode regex to a state machine in bytes symbols.
    # Our encoder generates 1 or more bytes for each unicode symbol.
    for text, (pattern, wants_pi) in combinations:
        original = text.encode('utf-8')  # u'...' --> b'...'
        source = cpppo.chainable(original)
        data = cpppo.dotdict()

        with cpppo.regex(
                name='pies', context="pies", initial=pattern, terminal=True,
                regex_alphabet=int,
                regex_typecode='B',
                regex_encoder=lambda s: (b for b in s.encode('utf-8'))) as pies:
            exhaust(pies, source, data)
            accepted = pies.terminal and data.pies.input.tobytes() == original
            log.detail("%s ends w/ re: %s: %s: %r", pies.name_centered(), pattern,
                       "string accepted" if accepted else "string rejected", data)
            expected = wants_pi == ('π' in text)
            assert accepted == expected
            assert original.startswith(data.pies.input.tobytes())
Esempio n. 8
0
def test_regex():
    """Verify regex-derived machines step by step, then over complete NUL-terminated inputs.

    The 'a*b.*x' machine is advanced one transition at a time over 'aaab1230xoxx' and the next
    input symbol, yielded state and (for the first steps) symbols sent are compared against
    expectations.  Three further machines are then run to completion and the amount and
    content of the input they collect is checked.
    """

    def run_logged( machine, source, data, ignore_nonterminal ):
        """Run `machine`, logging every transition; return the last index yielded (or None).

        A cpppo.NonTerminal exception is swallowed only if `ignore_nonterminal` is set.
        """
        last = None
        try:
            for last, (m, s) in enumerate( machine.run( source=source, data=data )):
                log.info( "%s #%3d -> %10.10s; next byte %3d: %-10.10r: %r", m.name_centered(),
                          last, s, source.sent, source.peek(), data )
        except cpppo.NonTerminal:
            if not ignore_nonterminal:
                raise
        return last

    def collected( data ):
        """Return the input gathered at data.input.input as a string for this Python version."""
        gathered = data.input.input
        if sys.version_info[0] < 3:
            return gathered.tostring()
        return gathered.tounicode()

    # Expectations per transition, indexed by step number.  The initial state does *not*
    # consume a source symbol, so the first 'a' is still pending after two transitions.
    next_inputs = [ 'a', 'a', 'a', 'a', 'b', '1', '2', '3', '0', 'x', 'o', 'x', 'x', None ]
    # Step 10 moves from the terminal state "3" back to the non-terminal state "2".
    state_names = [ "0'", "0", "0", "0", "2", "2", "2", "2", "2", "3", "2", "3", "3", None ]
    sent_counts = { 0: 0, 1: 0, 2: 1, 3: 2 }  # only checked for the first four steps

    # str() forces plain strings in 2.x, unicode in 3.x (counteracts unicode_literals)
    regex = str( 'a*b.*x' )
    machine = cpppo.regex( name=str( 'test1' ), initial=regex )
    with machine:
        source = cpppo.chainable( str( 'aaab1230xoxx' ))
        sequence = machine.run( source=source )
        for num in range( 20 ):
            try:
                mch, sta = next( sequence )
                inp = source.peek()
            except StopIteration:
                inp = source.peek()
                log.info( "%s <- %-10.10r test done", cpppo.centeraxis( mch, 25, clip=True ), inp )
                break
            log.info( "%s <- %-10.10r test rcvd", cpppo.centeraxis( mch, 25, clip=True ), inp )
            if sta is None:
                log.info( "%s <- %-10.10r test no next state", cpppo.centeraxis( mch, 25, clip=True ), inp )
            if inp is None:
                log.info( "%s <- %-10.10r test source finished", cpppo.centeraxis( mch, 25, clip=True ), inp )

            # No more transitions than expectations
            assert num < len( next_inputs )
            if next_inputs[num] is None:
                assert inp is None
            else:
                assert inp == next_inputs[num]
            if state_names[num] is None:
                assert sta is None
            else:
                assert sta.name == state_names[num]
            if num in sent_counts:
                assert source.sent == sent_counts[num]
        assert inp is None
        assert num == len( next_inputs )
        assert sta is None and machine.current.name == '3'

    # '.*' collects everything, including the trailing NUL, and ends terminal
    regex = str( '.*' )
    machine = cpppo.regex( name=str( 'dot' ), initial=regex, terminal=True )
    data = cpppo.dotdict()
    with machine:
        source = cpppo.chainable( str( 'aaab1230xoxx\0' ))
        i = run_logged( machine, source, data, ignore_nonterminal=True )
        assert machine.terminal
        assert i == 14
        assert source.sent == 13
        assert collected( data ) == 'aaab1230xoxx\x00'

    # '[^xyz]*' stops at the first 'x' and does not end terminal
    regex = str( '[^xyz]*' )
    machine = cpppo.regex( name=str( 'not_xyz' ), initial=regex )
    data = cpppo.dotdict()
    with machine:
        source = cpppo.chainable( str( 'aaab1230xoxx\0' ))
        i = run_logged( machine, source, data, ignore_nonterminal=True )
        assert not machine.terminal
        assert i == 9
        assert source.sent == 8
        assert collected( data ) == 'aaab1230'

    # The non-NUL machine collects everything before the NUL; NonTerminal is not tolerated
    regex = str( '[^\x00]*' )
    machine = cpppo.regex( name=str( 'not_NUL' ), initial=regex )
    data = cpppo.dotdict()
    with machine:
        source = cpppo.chainable( str( 'aaab1230xoxx\0' ))
        i = run_logged( machine, source, data, ignore_nonterminal=False )
        assert i == 13
        assert source.sent == 12
        assert collected( data ) == 'aaab1230xoxx'
Esempio n. 9
0
def test_codecs():
    """Check the '.*π.*' regex as a machine over Unicode symbols and over UTF-8 bytes.

    In both forms the machine must end terminal exactly when the text contains 'π', and must
    have collected the complete input either way.
    """
    # In Python3, the greenery.fsm is able to handle the Unicode str type; under
    # Python2, it can sanely only handle the non-Unicode str type.
    if sys.version_info < (3,):
        return

    pattern = '.*π.*'
    samples = (
        'pi: π',
        'abcdé\u4500123',
        'This contains π,π and more πs',
        'a 480Ω resistor',
    )

    def utf8_symbols(s):
        """Yield the UTF-8 encoded bytes of `s`, one symbol at a time."""
        return (b for b in s.encode('utf-8'))

    # First, the unicode regex as a state machine in unicode symbols.
    for text in samples:
        has_pi = 'π' in text
        with cpppo.regex(name='pies', context="pies", initial=pattern,
                         terminal=True) as pies:
            source = cpppo.chainable(text)
            data = cpppo.dotdict()
            try:
                for _machine, _state in pies.run(source=source, data=data):
                    pass
            except cpppo.NonTerminal:
                pass

            outcome = "string accepted" if pies.terminal else "string rejected"
            log.info("%s ends: %s: %r", pies.name_centered(), outcome, data)

            # The regex is greedy, and so runs 'til the end of input (next state
            # is None); it collects the full input string.
            assert pies.terminal == has_pi
            assert data.pies.input.tounicode() == text

    # Then the unicode regex as a state machine in bytes symbols.
    # Our encoder generates 1 or more bytes for each unicode symbol.
    for text in samples:
        has_pi = 'π' in text
        pies = cpppo.regex(name='pies', context="pies", initial=pattern, terminal=True,
                           regex_alphabet=int, regex_typecode='B',
                           regex_encoder=utf8_symbols)

        source = cpppo.chainable(text.encode('utf-8'))
        data = cpppo.dotdict()

        with pies:
            try:
                for _machine, _state in pies.run(source=source, data=data):
                    pass
            except cpppo.NonTerminal:
                pass

            outcome = "string accepted" if pies.terminal else "string rejected"
            log.info("%s ends: %s: %r", pies.name_centered(), outcome, data)

            assert pies.terminal == has_pi
            assert data.pies.input.tobytes().decode('utf-8') == text
Esempio n. 10
0
def test_regex():
    """Step the 'a*b.*x' machine through 'aaab1230xoxx' one transition at a time.

    After every transition the next input symbol, the yielded state and (for the first four
    steps) the number of symbols sent are compared against a table of expectations.  The run
    must end after exactly 14 transitions with input exhausted and the machine in state '3'.
    """
    # One row per transition: (next input symbol, yielded state name, source.sent).  A None
    # state name means no next state was yielded; a None count means it is not checked.
    # The initial state does *not* consume a source symbol.
    expectations = [
        ('a', "0'", 0),
        ('a', "0", 0),
        ('a', "0", 1),
        ('a', "0", 2),
        ('b', "2", None),
        ('1', "2", None),
        ('2', "2", None),
        ('3', "2", None),
        ('0', "2", None),
        ('x', "3", None),
        ('o', "2", None),  # Trans. from term. to non-term. state!
        ('x', "3", None),
        ('x', "3", None),
        (None, None, None),
    ]

    # str() forces plain strings in 2.x, unicode in 3.x (counteracts unicode_literals)
    regex = str('a*b.*x')
    machine = cpppo.regex(name=str('test1'), initial=regex)
    with machine:
        source = cpppo.chainable(str('aaab1230xoxx'))
        sequence = machine.run(source=source)
        for num in range(20):
            try:
                mch, sta = next(sequence)
                inp = source.peek()
            except StopIteration:
                inp = source.peek()
                log.info("%s <- %-10.10r test done",
                         cpppo.centeraxis(mch, 25, clip=True), inp)
                break
            log.info("%s <- %-10.10r test rcvd",
                     cpppo.centeraxis(mch, 25, clip=True), inp)
            if sta is None:
                log.info("%s <- %-10.10r test no next state",
                         cpppo.centeraxis(mch, 25, clip=True), inp)
            if inp is None:
                log.info("%s <- %-10.10r test source finished",
                         cpppo.centeraxis(mch, 25, clip=True), inp)

            # The machine must not yield more transitions than there are expectations
            assert num < len(expectations)
            want_inp, want_state, want_sent = expectations[num]
            if want_inp is None:
                assert inp is None
            else:
                assert inp == want_inp
            if want_state is None:
                assert sta is None
            else:
                assert sta.name == want_state
            if want_sent is not None:
                assert source.sent == want_sent
        assert inp is None
        assert num == len(expectations)
        assert sta is None and machine.current.name == '3'
Esempio n. 11
0
def test_codecs():
    """Run a regex containing 'π' over each sample as Unicode symbols and as UTF-8 bytes.

    Either way, the machine is expected to be terminal only for texts containing 'π', and the
    collected input must reproduce the whole text.
    """
    # In Python3, the greenery.fsm is able to handle the Unicode str type; under
    # Python2, it can sanely only handle the non-Unicode str type.
    if sys.version_info[0] < 3:
        return

    def exhaust(machine, source, data):
        """Drive `machine` until input ends or it stops in a non-terminal state."""
        try:
            for _mch, _sta in machine.run(source=source, data=data):
                pass
        except cpppo.NonTerminal:
            pass

    def report(machine, data):
        """Log whether `machine` ended in a terminal state, along with the collected data."""
        log.info(
            "%s ends: %s: %r",
            machine.name_centered(),
            "string accepted" if machine.terminal else "string rejected",
            data,
        )

    texts = [
        "pi: π",
        "abcdé\u4500123",
        "This contains π,π and more πs",
        "a 480Ω resistor",
    ]

    # Unicode regex --> state machine in unicode symbols.
    for text in texts:
        with cpppo.regex(name="pies", context="pies", initial=".*π.*", terminal=True) as pies:
            source = cpppo.chainable(text)
            data = cpppo.dotdict()
            exhaust(pies, source, data)
            report(pies, data)

            # The regex is greedy, and so runs 'til the end of input (next state
            # is None); it collects the full input string.
            assert pies.terminal == ("π" in text)
            assert data.pies.input.tounicode() == text

    # Unicode regex --> state machine in bytes symbols.
    # Our encoder generates 1 or more bytes for each unicode symbol.
    for text in texts:
        pies = cpppo.regex(
            name="pies",
            context="pies",
            initial=".*π.*",
            terminal=True,
            regex_alphabet=int,
            regex_typecode="B",
            regex_encoder=lambda s: (b for b in s.encode("utf-8")),
        )

        source = cpppo.chainable(text.encode("utf-8"))
        data = cpppo.dotdict()

        with pies:
            exhaust(pies, source, data)
            report(pies, data)

            assert pies.terminal == ("π" in text)
            assert data.pies.input.tobytes().decode("utf-8") == text
Esempio n. 12
0
def test_regex():
    """Advance the "a*b.*x" machine over "aaab1230xoxx" and check every single transition.

    The next input symbol and the yielded state are checked after each of the 14 expected
    transitions (plus the number of symbols sent for the first four).  Afterwards the source
    must be exhausted and the machine must rest in state "3".
    """
    # Expectations indexed by step number.  The initial state does *not* consume a source
    # symbol, so the leading "a" is still pending after the first two transitions.
    next_inputs = ["a", "a", "a", "a", "b", "1", "2", "3", "0", "x", "o", "x", "x", None]
    # Step 10 is a transition from the terminal state "3" back to the non-terminal "2";
    # the final None means no next state is yielded.
    state_names = ["0'", "0", "0", "0", "2", "2", "2", "2", "2", "3", "2", "3", "3", None]
    sent_counts = {0: 0, 1: 0, 2: 1, 3: 2}  # only checked for the first four steps

    # str() forces plain strings in 2.x, unicode in 3.x (counteracts unicode_literals)
    regex = str("a*b.*x")
    machine = cpppo.regex(name=str("test1"), initial=regex)
    with machine:
        source = cpppo.chainable(str("aaab1230xoxx"))
        sequence = machine.run(source=source)
        for num in range(20):
            try:
                mch, sta = next(sequence)
                inp = source.peek()
            except StopIteration:
                inp = source.peek()
                log.info("%s <- %-10.10r test done", cpppo.centeraxis(mch, 25, clip=True), inp)
                break
            log.info("%s <- %-10.10r test rcvd", cpppo.centeraxis(mch, 25, clip=True), inp)
            if sta is None:
                log.info("%s <- %-10.10r test no next state", cpppo.centeraxis(mch, 25, clip=True), inp)
            if inp is None:
                log.info("%s <- %-10.10r test source finished", cpppo.centeraxis(mch, 25, clip=True), inp)

            # No more transitions than expectations
            assert num < len(next_inputs)
            if next_inputs[num] is None:
                assert inp is None
            else:
                assert inp == next_inputs[num]
            if state_names[num] is None:
                assert sta is None
            else:
                assert sta.name == state_names[num]
            if num in sent_counts:
                assert source.sent == sent_counts[num]
        assert inp is None
        assert num == len(next_inputs)
        assert sta is None and machine.current.name == "3"