Example 1
def test_error_info():
    tokenize = make_tokenizer([
        Spec('keyword', r'(is|end)'),
        Spec('id', r'[a-z]+'),
        Spec('space', r'[ \t]+'),
        Spec('nl', r'[\n\r]+'),
    ])
    try:
        list(tokenize(u'f is ф'))
    except LexerError as e:
        eq_(unicode(e), u'1,6-1,6: cannot tokenize data: "f is \u0444"')
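The helpers used here (Spec, make_tokenizer, LexerError) come from funcparserlib.lexer (Example 8 shows the import), and eq_ is likely the nose.tools assertion helper. The expected position '1,6-1,6' is the (line, column) span of the first character no spec can match: the Cyrillic 'ф' at column 6 of line 1.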
Example 2
def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        make_multiline_comment(r'/\*', r'\*/'),
        make_comment(r'//'),
        newline,
        space,
        Spec('name',    r'[A-Za-z\200-\377_][A-Za-z\200-\377_0-9]*'),
        Spec('op',      r'[{};,=\[\]]|(->)|(--)'),
        Spec('number',  r'-?(\.[0-9]+)|([0-9]+(\.[0-9]*)?)'),
        Spec('string',  r'"[^"]*"'), # '\"' escapes are ignored
    ]
    useless = ['comment', 'newline', 'space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
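A rough usage sketch of this tokenizer (input invented for illustration; the exact Token repr may vary between funcparserlib versions):

# tokenize('digraph g { a -> b; }')
# would yield approximately:
#   [Token('name', 'digraph'), Token('name', 'g'), Token('op', '{'),
#    Token('name', 'a'), Token('op', '->'), Token('name', 'b'),
#    Token('op', ';'), Token('op', '}')]
# 'comment', 'newline' and 'space' tokens are filtered out by the final
# list comprehension.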
Example 3
def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        Spec('space', r'[ \t\r\n]+'),
        Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps,
             VERBOSE),
        Spec(
            'number', r'''
            -?                  # Minus
            (0|([1-9][0-9]*))   # Int
            (\.[0-9]+)?         # Frac
            ([Ee][+-][0-9]+)?   # Exp
            ''', VERBOSE),
        Spec('op', r'[{}\[\]\-,:]'),
        Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'),
    ]
    useless = ['space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
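This tokenizer relies on the module-level regexps dict and re's VERBOSE flag (Example 9 shows the regexps definition). A rough sketch of its output, with the input invented for illustration:

# tokenize('{"answer": 42}')
# -> roughly [Token('op', '{'), Token('string', '"answer"'), Token('op', ':'),
#             Token('number', '42'), Token('op', '}')]
# 'space' tokens are dropped before the list is returned.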
Example 4
def tokenize(str):
    'str -> Sequence(Token)'
    specs = [
        Spec('space', r'[ \t\r\n]+'),
        Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE),
        # NOTE: a number can sometimes end up where a name is expected, so a separate number spec is not used here
        # TODO: consider removing or updating it
        # Spec('number', r'''
        #     -?                  # Minus
        #     (0|([1-9][0-9]*))   # Int
        #     (\.[0-9]+)?         # Frac
        #     ([Ee][+-][0-9]+)?   # Exp
        #     \b''', VERBOSE),
        Spec('op', r'[{}\(\),;=]'),
        Spec('comment', r'/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/'),
        Spec('name', r'[/.A-Za-z_0-9]+'),
    ]
    useless = ['space']
    t = make_tokenizer(specs)
    return [x for x in t(str) if x.type not in useless]
Example 5
def test_error_info():
    tokenize = make_tokenizer([
        Spec('keyword', r'(is|end)'),
        Spec('id', r'[a-z]+'),
        Spec('space', r'[ \t]+'),
        Spec('nl', r'[\n\r]+'),
    ])
    try:
        list(tokenize('f is ф'))
    except LexerError as e:
        pass
    else:
        ok_(False, 'must raise LexerError')

    keyword = lambda s: tok('keyword', s)

    id = tok('id')
    is_ = keyword('is')
    end = keyword('end')
    nl = tok('nl')

    equality = id + skip(is_) + id >> tuple
    expr = equality + skip(nl)
    file = many(expr) + end

    msg = """\
rake is eggs
eggs isnt spam
end"""
    toks = [x for x in tokenize(msg) if x.type != 'space']
    try:
        file.parse(toks)
    except ParserError as e:
        msg, pos, i = e.args
        eq_(msg, "got unexpected token: id 'spam'")
        eq_(pos, ((2, 11), (2, 14)))
        # May raise IndexError if i is out of range
        t = toks[i]
        eq_(t, Token('id', 'spam'))
    else:
        ok_(False, 'must raise ParserError')
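The parser half of this test uses funcparserlib's combinators: + sequences two parsers, skip() matches a parser but discards its result, >> maps the parsed value through a function, and many() repeats a parser zero or more times. Walking through the input:

# equality parses   id 'is' id   and packs the two ids into a tuple, so the
# first line, 'rake is eggs', yields ('rake', 'eggs').
# On the second line, 'isnt' is not a single token: the keyword regex
# r'(is|end)' has no word-boundary guard, so it tokenizes as the keyword
# 'is' followed by the id 'nt'. equality therefore matches ('eggs', 'nt'),
# after which expr expects a newline but finds the id 'spam', which is
# exactly the token and (2, 11)-(2, 14) position asserted above.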
Example 6
def make_multiline_comment(open, close):
    return Spec('comment', r'%s(.|[\r\n])*?%s' % (open, close), MULTILINE)
Example 7
def make_comment(start):
    return Spec('comment', r'%s.*' % start)
Example 8
from re import MULTILINE
from funcparserlib.lexer import Spec

__all__ = ['make_comment', 'make_multiline_comment', 'newline', 'space']


# Comments
def make_comment(start):
    return Spec('comment', r'%s.*' % start)


def make_multiline_comment(open, close):
    return Spec('comment', r'%s(.|[\r\n])*?%s' % (open, close), MULTILINE)


# Common tokens
newline = Spec('newline', r'[\r\n]+')
space = Spec('space', r'[ \t\r\n]+')
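For reference, a sketch of how these factories expand with the same arguments Example 2 passes them:

# make_comment(r'//')
#   -> Spec('comment', r'//.*')          # comment runs to the end of the line
# make_multiline_comment(r'/\*', r'\*/')
#   -> Spec('comment', r'/\*(.|[\r\n])*?\*/', MULTILINE)
# Example 2 above plugs both of these, plus newline and space, into its spec list.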
Example 9
ENCODING = 'utf-8'
regexps = {
    'escaped':
    ur'''
        \\                                  # Escape
          ((?P<standard>["\\/bfnrt])        # Standard escapes
        | (u(?P<unicode>[0-9A-Fa-f]{4})))   # uXXXX
        ''',
    'unescaped':
    ur'''
        [\x20-\x21\x23-\x5b\x5d-\uffff]     # Unescaped: avoid ["\\]
        ''',
}

specs = [
    Spec('eol', r'[\r\n]+'),
    Spec('space', r'\s+'),
    Spec('string', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, VERBOSE),
    Spec('name', r'[A-Za-z_][A-Za-z_0-9]*'),
    Spec('class', r'\.[A-Za-z_][A-Za-z_0-9]*'),
    Spec('id', r'#[A-Za-z_][A-Za-z_0-9]*'),
    Spec('eq', r'='),
    Spec('>', '>'),
    Spec('<', '<'),
]
tokenizer = make_tokenizer(specs)


class Eol(object):
    def __init__(self, data):
        pass
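As a rough illustration of the specs above (input invented for this sketch), the tokenizer splits a selector-like line as follows:

# list(tokenizer(u'#page > .content'))
# -> roughly [Token('id', '#page'), Token('space', ' '), Token('>', '>'),
#             Token('space', ' '), Token('class', '.content')]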