def get_bracket_parser(self):
    """
    Build the matcher for parenthesised (bracketed) phrases.

    A label is any run of characters other than space, tab, newline,
    carriage return and parentheses.  A phrase is ``(`` ... ``)`` whose
    content is one of:

    * a terminal: a bare label, or a label, a dropped space and a
      Node-wrapped label;
    * a label followed by one or more nested phrases;
    * one or more nested phrases.

    Each phrase is wrapped in a ``Node``.

    :return: the ``Delayed`` matcher for a phrase.
    """
    tree = Delayed()
    token = Regexp(r"[^ \t\n\r\(\)]+")
    leaf = token > Node
    # built outside DroppedSpace(), so the space here is matched explicitly
    terminal = token | (token & Drop(Space()) & leaf)
    with DroppedSpace():
        # every `&` and `[1:]` below is evaluated inside the DroppedSpace
        # context, exactly as in a single-expression definition
        open_bracket = Drop('(')
        close_bracket = Drop(')')
        content = terminal | token & tree[1:] | tree[1:]
        tree += open_bracket & content & close_bracket > Node
    return tree
def test_safety(self):
    """
    Check that a ``Separator`` only affects matchers built inside its own
    ``with`` block, including matchers built by another thread meanwhile.

    Expected parses of 'cababab', as asserted below:

    * matcher1, matcher6 (main thread, outside the block): the full text;
    * matcher2, matcher5 (main thread, inside the 'a'-dropping block):
      'cbbb';
    * matcher3 (worker thread, no separator of its own): the full text;
    * matcher4 (worker thread, inside a 'b'-dropping block): 'caaa'.
    """
    text = 'cababab'
    matcher3 = Delayed()
    matcher4 = Delayed()

    matcher1 = Any()[::'b', ...] & Eos()
    with Separator(Drop(Any('a')[:])):
        matcher2 = Any()[::'b', ...] & Eos()

        # pylint: disable-msg=W0613
        def build_in_thread(matcher3=matcher3, matcher4=matcher4):
            # runs in the worker thread while the main thread is still
            # inside the 'a'-dropping block
            matcher3 += Any()[::'b', ...] & Eos()
            with Separator(Drop(Any('b')[:])):
                matcher4 += Any()[::'b', ...] & Eos()

        worker = Thread(target=build_in_thread)
        worker.start()
        worker.join()

        matcher5 = Any()[::'b', ...] & Eos()
    matcher6 = Any()[::'b', ...] & Eos()

    # (expected first result, matcher), checked in the original order
    expectations = (
        (text, matcher1),
        ('cbbb', matcher2),
        (text, matcher3),
        ('caaa', matcher4),
        ('cbbb', matcher5),
        (text, matcher6),
    )
    for expected, matcher in expectations:
        parsed = matcher.parse_string(text)
        assert expected == parsed[0], parsed
class ExtensionParser(object):
    """
    Grammar for extension call strings.

    The top-level rule ``EXTENSION`` accepts either a bare identifier or an
    identifier followed by a parenthesised, comma-separated list of
    positional and keyword arguments, up to end of input, and builds an
    ``ExtensionCall``.
    """

    class ExtensionCall(Node):
        """
        Result node for one parsed extension call.
        """

        # Defaults; the properties below treat any falsy value as
        # "nothing captured".
        _name = None
        _args = None
        _kwargs = None

        @property
        def name(self):
            """First entry of ``_name``, or None when it is empty/unset."""
            if not self._name:
                return None
            return self._name[0]

        @property
        def args(self):
            """``_args`` as a tuple; an empty tuple when empty/unset."""
            if not self._args:
                return tuple()
            return tuple(self._args)

        @property
        def kwargs(self):
            """``_kwargs`` as a dict; an empty dict when empty/unset."""
            if not self._kwargs:
                return {}
            return dict(self._kwargs)

    # --- tokens ---------------------------------------------------------
    COMMA = Drop(',')
    NONE = Literal('None') >> (lambda x: None)
    BOOL = (Literal('True') | Literal('False')) >> (lambda x: x == 'True')
    IDENTIFIER = Word(Letter() | '_', Letter() | '_' | Digit())
    FLOAT = Real() >> float
    INTEGER = Integer() >> int
    # double-quoted or single-quoted
    STRING = String() | String("'")
    ITEM = STRING | INTEGER | FLOAT | NONE | BOOL | IDENTIFIER

    # --- composite rules, built with a dropped \s* separator --------------
    with Separator(~Regexp(r'\s*')):
        # VALUE is recursive (lists/tuples contain values), hence Delayed
        VALUE = Delayed()
        LIST = Drop('[') & VALUE[:, COMMA] & Drop(']') > list
        TUPLE = Drop('(') & VALUE[:, COMMA] & Drop(')') > tuple
        VALUE += LIST | TUPLE | ITEM

        # positional values are tagged '_args'; `identifier = value` pairs
        # become tuples tagged '_kwargs'
        ARGUMENT = VALUE >> '_args'
        KWARGUMENT = (IDENTIFIER & Drop('=') & VALUE > tuple) >> '_kwargs'
        ARGUMENTS = (KWARGUMENT | ARGUMENT)[:, COMMA]

        NAME = IDENTIFIER > '_name'
        EXTENSION = (
            (NAME & Drop('(') & ARGUMENTS & Drop(')')) | NAME
        ) & Eos() > ExtensionCall

    @property
    def parser(self):
        """The parse function from ``EXTENSION.get_parse_string()``."""
        return self.EXTENSION.get_parse_string()
def test_node(self):
    """
    Parse '1' with transforms composed and check that the result is a
    ``Term`` node holding the named 'number' child.
    """
    class Term(Node):
        pass

    number = Any('1') > 'number'
    term = number > Term
    factor = term | Drop(Optional(term))
    factor.config.clear().compose_transforms()

    parse = factor.get_parse_string()
    results = parse('1')
    ast = results[0]

    assert type(ast) == Term, type(ast)
    first_child = ast[0]
    assert first_child == '1', first_child
    # NOTE(review): the expected rendering is reproduced exactly as it
    # appears in this file (on a single line); confirm whether the literal
    # was meant to span two lines.
    expected_rendering = """Term `- number '1'"""
    assert str26(ast) == expected_rendering, ast
def create_parser(delimiter):
    """
    Build a parse function for a delimited list of arguments.

    Each argument is either a positional value (tagged ``'arg'``) or an
    ``identifier=value`` pair (a tuple tagged ``'karg'``).  Values may be
    strings, integers, floats, ``None``, ``True``/``False``, identifiers,
    bare words, or nested lists ``[...]``, tuples ``(...)`` and dicts
    ``{key: value, ...}``.  The whole input is wrapped in a ``Node``.

    :param delimiter: ``','`` for comma-delimited arguments or ``' '`` for
        space-delimited arguments.
    :return: the function returned by ``expr.get_parse()``.
    :raises AssertionError: if ``delimiter`` is neither ``','`` nor ``' '``.
    """
    # Validate first, with an explicit raise: an `assert` statement is
    # removed under `python -O`, which would silently treat any unsupported
    # delimiter as a space.  The exception type is kept for existing callers.
    if delimiter not in (',', ' '):
        raise AssertionError('delimiter "%s" not supported' % (delimiter,))

    space = Space()
    comma = Drop(',') | Drop(',') + space

    if delimiter == ',':
        # by comma
        separator = Separator(~Regexp(r'\s*'))
        item_delimiter = comma
    else:
        separator = DroppedSpace()
        item_delimiter = space

    # --- scalar tokens ----------------------------------------------------
    none = Literal('None') >> (lambda _: None)
    boolean = (Literal('True') | Literal('False')) >> (lambda x: x == 'True')
    ident = Word(Letter() | '_', Letter() | '_' | Digit())
    float_ = Float() >> float
    int_ = Integer() >> int
    # double-quoted or single-quoted
    str_ = String() | String("'")

    dict_key = str_ | int_ | float_ | Word()
    item = str_ | int_ | float_ | none | boolean | ident | Word()

    # --- composite rules, built under the chosen separator ----------------
    with separator:
        # value is recursive (containers hold values), hence Delayed
        value = Delayed()
        list_ = Drop('[') & value[:, comma] & Drop(']') > list
        tuple_ = Drop('(') & value[:, comma] & Drop(')') > tuple
        dict_el = dict_key & Drop(':') & value > tuple
        dict_ = Drop('{') & dict_el[1:, Drop(',')] & Drop('}') > dict
        value += list_ | tuple_ | dict_ | item | space

        arg = value >> 'arg'
        karg = (ident & Drop('=') & value > tuple) >> 'karg'
        expr = (karg | arg)[:, item_delimiter] & Drop(Eos()) > Node

    return expr.get_parse()
def make_binary_parser():
    '''
    Create a parser for binary data.

    The grammar built here accepts a parenthesised, comma-separated list of
    entries, optionally preceded by ``Node`` or by a class name.  An entry
    may be a value (big- or little-endian literal), a ``value/length`` pair,
    a ``value*count`` repeat, a ``name=...`` pair, or a nested expression.

    Returns the function produced by ``expr.get_parse_sequence()``.
    '''

    # avoid import loops
    from lepl import Word, Letter, Digit, UnsignedInteger, \
        Regexp, DfaRegexp, Drop, Separator, Delayed, Optional, Any, First, \
        args, Trace, TraceVariables
    from lepl.bin.bits import BitString
    from lepl.support.node import Node

    # cache of dynamically created Node sub-classes, keyed by name
    classes = {}

    def named_class(name, *args):
        '''
        Given a name and some args, create a sub-class of Node (once per
        name; later calls reuse the cached class) and create an instance
        with the given content.
        '''
        if name not in classes:
            classes[name] = type(name, (Node, ), {})
        return classes[name](*args)

    with TraceVariables(False):

        # repeat the bits `l` `n` times; `n` is text, and base 0 lets int()
        # pick the base from the literal's prefix
        mult = lambda l, n: BitString.from_sequence([l] * int(n, 0))

        # an attribute or class name
        name = Word(Letter(), Letter() | Digit() | '_')

        # lengths can be integers (bits) or floats (bytes.bits)
        # but if we have a float, we do not want to parse as an int
        # (or we will get a conversion error due to too small length)
        length = First(
            UnsignedInteger() + '.' + Optional(UnsignedInteger()),
            UnsignedInteger())

        # a literal decimal
        decimal = UnsignedInteger()

        # a binary number (without pre/postfix)
        binary = Any('01')[1:]

        # an octal number (without pre/postfix)
        octal = Any('01234567')[1:]

        # a hex number (without pre/postfix)
        hex_ = Regexp('[a-fA-F0-9]')[1:]

        # the letters used for binary, octal, hex and decimal values
        # (eg the 'x' in 0xffee)
        # pylint: disable-msg=C0103
        b, o, x, d = Any('bB'), Any('oO'), Any('xX'), Any('dD')

        # a decimal with optional pre/postfix
        dec = '0' + d + decimal | decimal + d + '0' | decimal

        # little-endian literals have normal prefix syntax (eg 0xffee)
        little = decimal | '0' + (b + binary | o + octal | x + hex_)

        # big-endian literals have postfix (eg ffeex0)
        big = (binary + b | octal + o | hex_ + x) + '0'

        # optional spaces - will be ignored
        # (use DFA here because it's multi-line, so \n will match ok)
        spaces = Drop(DfaRegexp('[ \t\n\r]*'))

        with Separator(spaces):

            # the grammar is recursive - expressions can contain expressions -
            # so we use a delayed matcher here as a placeholder, so that we
            # can use them before they are defined.
            expr = Delayed()

            # an implicit length value can be big or little-endian
            ivalue = big | little > args(BitString.from_int)

            # a value with a length can also be decimal
            lvalue = (big | little | dec) & Drop('/') & length \
                > args(BitString.from_int)

            # try the explicit-length form first
            value = lvalue | ivalue

            # value*count, where count is a little-endian (prefix) literal
            repeat = value & Drop('*') & little > args(mult)

            # a named value is also a tuple
            named = name & Drop('=') & (expr | value | repeat) > tuple

            # an entry in the expression could be any of these
            entry = named | value | repeat | expr

            # and an expression itself consists of a comma-separated list of
            # one or more entries, surrounded by parentheses
            entries = Drop('(') & entry[1:, Drop(',')] & Drop(')')

            # the Node may be explicit or implicit and takes the list of
            # entries as an argument list
            node = Optional(Drop('Node')) & entries > Node

            # alternatively, we can give a name and create a named sub-class
            other = name & entries > args(named_class)

            # and finally, we "tie the knot" by giving a definition for the
            # delayed matcher we introduced earlier, which is either a plain
            # Node or a named sub-class
            expr += spaces & (node | other) & spaces

    #expr = Trace(expr)
    # this changes order, making 0800x0 parse as binary
    expr.config.no_compile_to_regexp()
    # use sequence to force regexp over multiple lines
    return expr.get_parse_sequence()
# Assume the caller already that folding when parsing headers. # NOTE: qdtext also allows non-ascii, which we choose to parse # as ISO-8859-1; rejecting it entirely would also be permitted. # Some broken browsers attempt encoding-sniffing, which is broken # because the spec only allows iso, and because encoding-sniffing # can mangle valid values. # Everything else in this grammar (including RFC 5987 ext values) # is in an ascii-safe encoding. # Because of this, this is the only character class to use AnyBut, # and all the others are defined with Any. qdtext = AnyBut('"' + ctl_chars) char = Any(''.join(chr(i) for i in xrange(128))) # ascii range: 0-127 quoted_pair = Drop('\\') + char quoted_string = Drop('"') & (quoted_pair | qdtext)[:, ...] & Drop('"') value = token | quoted_string # Other charsets are forbidden, the spec reserves them # for future evolutions. charset = (CaseInsensitiveLiteral('UTF-8') | CaseInsensitiveLiteral('ISO-8859-1')) # XXX See RFC 5646 for the correct definition language = token attr_char = Any(attr_chars) hexdig = Any(hexdigits) pct_encoded = '%' + hexdig + hexdig
def target(matcher3=matcher3, matcher4=matcher4):
    """
    Define both delayed matchers in place: ``matcher3`` with no separator
    block around it, ``matcher4`` inside a block whose separator is a
    dropped run of 'b' characters.

    The matchers are bound as default arguments so that ``+=`` operates on
    the caller's objects.
    """
    matcher3 += Any()[::'b', ...] & Eos()
    dropped_bs = Drop(Any('b')[:])
    with Separator(dropped_bs):
        matcher4 += Any()[::'b', ...] & Eos()