def __init__(self):
    """Build the response parser.

    Recognises three response shapes:
      * ``indi <path> name=value[, name=value ...]``
      * ``GPO``/``GPI`` followed by a number and five ``h``/``l`` pin states
      * ``ERROR <number> <message>``
    The combined parser is stored on ``self.p``.
    """
    # TODO: finish this parser so it does more than just parse indi and
    # GPO responses.
    indi_operator = string_from("indi")
    gp_operator = string_from("GPO", "GPI")
    space = string(" ")
    obj = regex(r"[a-zA-Z0-9.#]*")
    name = regex(r"[a-zA-Z]*")
    simple_string = regex(r"[a-zA-Z ]*")
    equals = string("=")
    # Fix: try the quoted form FIRST.  The bare form matches any run of
    # non-comma characters (including the empty run), so with it first the
    # quoted alternative was unreachable and quoted values containing
    # commas were truncated at the first comma.
    value = regex(r'".*"') | regex(r"[^,]*")
    number = regex(r"[0-9]+")
    # Exactly five pin states; the literal map is "h" -> False, "l" -> True.
    gp_value = regex(r"[hl]").map(lambda v: {"h": False, "l": True}[v]) * 5
    indi_parser = seq(indi_operator << space).then(
        seq(
            path=obj << space.optional(),
            # "name=value" pairs, ", "-separated, folded into one dict.
            info=seq(
                name=name << equals,
                value=value << string(", ").optional()).map(lambda x: {
                    x["name"]: x["value"]
                }).many().map(
                    lambda kv: {k: v for d in kv for k, v in d.items()}),
        ))
    gp_parser = seq(gp_operator << space).then(
        seq(number=number << space, pins=gp_value))
    error_parser = seq(string("ERROR") << space).then(
        seq(number << space, simple_string))
    self.p = indi_parser | gp_parser | error_parser
def make_element_parser(config, content, jinja):
    """Build the parser for one element: raw-text elements win, then
    self-closing elements, then ordinary container elements."""
    opening_tag = make_opening_tag_parser(
        config,
        tag_name_parser=P.string_from(*SELF_CLOSING_ELEMENTS),
        allow_slash=True,
        jinja=jinja,
    )
    # A self-closing element is an opening tag alone: content and closing
    # tag are both absent, represented by None.
    self_closing = locate(
        P.seq(
            opening_tag.skip(whitespace),
            P.success(None),
            P.success(None),
        )
    ).combine(_combine_element)
    container = make_container_element_parser(
        config, content=content, jinja=jinja,
    )
    raw_style = make_raw_text_element_parser(config, 'style', jinja=jinja)
    raw_script = make_raw_text_element_parser(config, 'script', jinja=jinja)
    return raw_style | raw_script | self_closing | container
def test_string_from_transform(self):
    """With a transform, matching is done on the transformed input but
    the canonical (untransformed) alternative is returned."""
    parser = string_from("Mr", "Mr.", "Mrs", "Mrs.",
                         transform=lambda s: s.lower())
    for text, expected in [("mr", "Mr"), ("mr.", "Mr."),
                           ("MR", "Mr"), ("MR.", "Mr.")]:
        self.assertEqual(parser.parse(text), expected)
def make_element_parser(config, content, jinja):
    """Assemble the element parser: raw-text elements first, then void
    elements, SVG self-closing elements, and finally containers."""

    def _tag_only(opening_tag):
        # An element consisting of an opening tag alone: content and
        # closing tag are absent, represented by None.
        return locate(
            P.seq(
                opening_tag.skip(whitespace),
                P.success(None),
                P.success(None),
            )
        ).combine(_combine_element)

    void_element = _tag_only(
        make_opening_tag_parser(
            config,
            tag_name_parser=P.string_from(*VOID_ELEMENTS),
            allow_slash=True,
            jinja=jinja,
        )
    )
    svg_self_closing_element = _tag_only(
        make_opening_tag_parser(
            config,
            tag_name_parser=P.string_from(*SVG_SELF_CLOSING_ELEMENTS),
            mandate_slash=True,
            jinja=jinja,
        )
    )
    container_element = make_container_element_parser(
        config, content=content, jinja=jinja
    )
    style = make_raw_text_element_parser(config, "style", jinja=jinja)
    script = make_raw_text_element_parser(config, "script", jinja=jinja)
    return (
        style
        | script
        | void_element
        | svg_self_closing_element
        | container_element
    )
def word_variable_complex():
    """Parse a ``${name}`` or ``${name<op>word}`` variable expansion."""
    yield string("${")
    result = yield variable_name.map(ConstantString).map(VarRef)
    # Longest alternatives first so "##" is not read as two "#"s.
    operator = yield string_from("##", "#", "%%", "%").optional()
    if operator is None:
        yield string("}")
        return result
    argument = yield word
    yield string("}")
    return VarOp(result, operator, argument)
def test_string_from(self):
    """string_from matches any alternative and returns the matched
    string; failure mentions every alternative in the error."""
    parser = string_from("Mr", "Mr.", "Mrs", "Mrs.")
    self.assertEqual(parser.parse("Mr"), "Mr")
    self.assertEqual(parser.parse("Mr."), "Mr.")
    combined = parser + string(" Hyde")
    self.assertEqual(combined.parse("Mr. Hyde"), "Mr. Hyde")
    with self.assertRaises(ParseError) as err:
        parser.parse('foo')
    self.assertEqual(
        str(err.exception),
        "expected one of 'Mr', 'Mr.', 'Mrs', 'Mrs.' at 0:0",
    )
def passport():
    """Collect key/value fields into a dict until the record ends.

    A record ends at a missing separator, at a blank line (two
    consecutive newlines), or at end of input.
    """
    collected = {}
    while True:
        key, val = yield field
        collected[key] = val
        sep = yield string_from(" ", "\n").optional()
        if sep is None:
            # No separator at all: the record ends here.
            break
        if sep == " ":
            continue
        # sep == "\n": peek ahead — a second newline or EOF closes
        # the record; otherwise fields continue on the next line.
        next_newline = yield peek(string("\n").optional())
        at_eof = yield peek(eof.result(True).optional())
        if next_newline == "\n" or at_eof:
            break
    return collected
CHIFFRES_ARABES = case_insensitive_string("1er").result("1") | regex(r"\d+") CHIFFRES_ROMAINS = case_insensitive_string("Ier") | regex(r"[IVXLCDM]+") LETTRES = regex(r"[A-Z]+") NUMERO = ( string("liminaire").result("0") | case_insensitive_string("premier").result("1") | CHIFFRES_ARABES | CHIFFRES_ROMAINS | LETTRES ) MULTIPLICATIF = string_from(*ADJECTIFS_MULTIPLICATIFS) ADDITIONNEL = regex(r"[A-Z]+") # alias "andouillette" (AAAAA) MULT_ADD = ( seq(MULTIPLICATIF.skip(whitespace), ADDITIONNEL).map(" ".join) | MULTIPLICATIF | ADDITIONNEL ) # Divisions uniques INTITULE = string_from( "Intitulé de la proposition de loi", "Intitulé du projet de loi",
'*': 7, '/': 7, '+': 6, '-': 6, '<': 5, '>': 5, '<=': 5, '>=': 5, '=': 4, '<>': 4, 'and': 2, 'or': 1, } prec = lambda op: op_prec_table[op] operator = lexeme(parsy.string_from(*op_prec_table)) unary_op_list = ['-', 'not'] unary_op = lexeme(parsy.string_from(*unary_op_list)) symbol_op_list = [ '[]', '.', '+', '-', '*', '/', '=', '<>', '>', '<', '>=', '<=' ] ascii_op_list = ['and', 'or', 'not'] # Parser helper combinators def peek(parser): @parsy.Parser def helper(stream, index): try:
""" Stripped down logo lexer, for tokenizing Turtle Logo programs like: fd 1 bk 2 rt 90 etc. """ from parsy import eof, regex, seq, string, string_from, whitespace command = string_from("fd", "bk", "rt", "lt") number = regex(r'[0-9]+').map(int) optional_whitespace = regex(r'\s*') eol = string("\n") line = seq(optional_whitespace >> command, whitespace >> number, (eof | eol | (whitespace >> eol)).result("\n")) flatten_list = lambda ls: sum(ls, []) lexer = line.many().map(flatten_list)
else: return (yield parser) return a alpha_numeric = p.regex('[a-zA-Z0-9_]') lower_alpha = p.regex('[a-z]') upper_alpha = p.regex('[A-Z]') lower_word = p.seq(lower_alpha, alpha_numeric.many().concat()).concat() upper_word = p.seq(upper_alpha, alpha_numeric.many().concat()).concat() integer = p.seq( p.string_from('+', '-').optional(), p.decimal_digit.at_least(1).concat().map(int)).combine( lambda sign, number: -number if sign == '-' else number) sq_char = p.regex('[a-zA-Z0-9 _\\-/~!@#$%^&*(),."]') single_quote = p.string("'") single_quoted = single_quote >> sq_char.at_least(1).concat() << single_quote dq_char = p.regex("[a-zA-Z0-9 _\\-/~!@#$%^&*(),.']") double_quote = p.string('"') double_quoted = double_quote >> dq_char.at_least(1).concat() << double_quote atomic_word = lower_word | single_quoted name = atomic_word | integer
"dark red bags contain 2 dark orange bags.", "dark orange bags contain 2 dark yellow bags.", "dark yellow bags contain 2 dark green bags.", "dark green bags contain 2 dark blue bags.", "dark blue bags contain 2 dark violet bags.", "dark violet bags contain no other bags.", ]) # Parsing Combinators optional_whitespace = regex(r"\s*") word = regex(r"\w+") number = regex(r"\d+") color_att = word color = word bag_term = string_from("bag", "bags") bag = seq(color_att, whitespace >> color).map(tuple) << whitespace << bag_term no_content = string("no other bags").map(lambda x: {}) content = (seq(number.map(int), whitespace >> bag).map(lambda a: (a[1], a[0])).sep_by( string(", ")).map(dict)) bag_contents = no_content | content rule = seq(bag, whitespace >> string("contain") >> whitespace >> bag_contents) << string(".") rules = rule.sep_by(string("\n")).map(dict) << string("\n").optional() def find_containers(mapping, bag):
def case_insensitive_string_from(*expected_strings: str) -> Any:
    """Match any of *expected_strings* ignoring case, returning the
    canonical spelling as listed here."""
    return string_from(*expected_strings, transform=str.lower)
# Arabic numerals: "1e"/"1er"/"1ère" (any case) normalised to "1",
# otherwise any run of digits.
CHIFFRES_ARABES = case_insensitive_string_from(
    "1e", "1er", "1ère").result("1") | regex(r"\d+")
# Roman numerals: "Ier" (any case) or a run of roman-numeral letters.
CHIFFRES_ROMAINS = case_insensitive_string("Ier") | regex(r"[IVXLCDM]+")
# Capital-letter labels; allow dotted capital i (İ), accents removed.
LETTRES_CAPITALES = regex(r"[A-Zİ]+").map(
    remove_accents)
# A division number: "liminaire" -> "0", "premier"/"unique" -> "1",
# "PRÉLIMINAIRE" kept verbatim, else arabic/roman/letter forms above.
NUMERO = (string("liminaire").result("0")
          | case_insensitive_string_from("premier", "unique").result("1")
          | string("PRÉLIMINAIRE")
          | CHIFFRES_ARABES
          | CHIFFRES_ROMAINS
          | LETTRES_CAPITALES)
# Multiplicative adjectives (e.g. "bis", "ter") from a fixed word list.
MULTIPLICATIF = string_from(*ADJECTIFS_MULTIPLICATIFS)
ADDITIONNEL = LETTRES_CAPITALES  # alias "andouillette" (AAAAA)
# "bis A" joined with a space, or a multiplicative alone, or a letter
# label alone.
MULT_ADD = (seq(MULTIPLICATIF << whitespace.optional(), ADDITIONNEL).map(
    " ".join) | MULTIPLICATIF | ADDITIONNEL)

# Unique divisions
# Title heading: optional "Intitulé de la/du " prefix, then the kind of
# text, normalised to "titre"; the rest of the line is consumed.
# NOTE(review): "de la" appears twice in the alternatives — looks like a
# duplicate ("de l'" may have been intended); confirm against callers.
INTITULE = ((case_insensitive_string("Intitulé") >> whitespace >>
             case_insensitive_string_from("de la", "de la", "du") >>
             whitespace).optional() >> case_insensitive_string_from(
                 "proposition de loi", "projet de loi",
                 "texte").result("titre") << regex(".*"))
# Decimal literal: optional sign, digits with "_" group separators and an
# optional trailing "d" marker; separators/marker stripped before int().
decimal = regex(r'-?[0-9_]+(_[0-9]+)*d?').map(
    lambda s: int(re.sub(r'[_d]', '', s)))
# Octal literal: mandatory trailing "o" marker, parsed base 8.
octal = regex(r'-?[0-7_]+(_[0-7]+)*o').map(
    lambda s: int(re.sub(r'[_o]', '', s), 8))
# Binary literal: mandatory trailing "b" marker, parsed base 2.
binary = regex(r'-?[0-1_]+(_[0-1]+)*b').map(
    lambda s: int(re.sub(r'[_b]', '', s), 2))
# Most specific bases first so their digit sets/suffixes are not
# swallowed by the plain decimal form.
digit = binary | octal | decimal | hexadecimal
# Lower-case identifier; "-" or "_" allowed between alphanumeric runs.
identifier = regex(r'[a-z][a-z0-9]*([-_][a-z0-9]+)*')
# The instruction mnemonics of the target machine.
mnemonic = string_from("halt", "noop", "push", "load", "store", "xchg",
                       "dup", "dupn", "swap", "pop", "jump", "link",
                       "spawn", "add", "sub", "mul", "div", "mod", "neg",
                       "not", "and", "or", "xor")
# Condition codes that may prefix a mnemonic as "<condition>.<mnemonic>".
condition = string_from("always", "overflow", "zero", "non-zero",
                        "positive", "negative", "high", "safe")
# ";" to end of line.  NOTE: the name `comment` is rebound here from the
# (out-of-view) constructor to the parser built with it; same pattern for
# `instruction` and `label` below.
comment = (string(";") >> regex(r'[^\n\r]*') << (eol | eof)).map(comment)
# Instruction: optional "condition." prefix, a mnemonic, and an optional
# whitespace-separated numeric or identifier operand.
instruction = seq(condition=(condition << string(".")).optional(),
                  mnemonic=mnemonic,
                  operand=(ws1 >> (digit | identifier)).optional()).map(instruction)
# Label: an identifier immediately followed by ":".
label = (identifier << string(":")).map(label)
# A statement is a label, an instruction, or a comment.
statement = label | instruction | comment
# We don't support ' in strings or escaping for simplicity string_literal = regex(r"'[^']*'").map(lambda s: String(s[1:-1])) identifier = regex('[a-zA-Z][a-zA-Z0-9_]*') field = identifier.map(Field) table = identifier.map(Table) space = regex(r'\s+') # non-optional whitespace padding = regex(r'\s*') # optional whitespace column_expr = field | string_literal | number_literal operator = string_from('=', '<', '>', '<=', '>=') comparison = seq( left=column_expr << padding, operator=operator, right=padding >> column_expr, ).combine_dict(Comparison) SELECT = string('SELECT') FROM = string('FROM') WHERE = string('WHERE') # Here we demonstrate use of leading underscore to discard parts we don't want, # which is more readable and convenient than `<<` and `>>` sometimes. select = seq( _select=SELECT + space,
} _essential_keys = { "byr", "iyr", "eyr", "hgt", "hcl", "ecl", "pid", #'cid', } optional_whitespace = regex(r"\s*") field_key = string_from(*passport_validation.keys()).desc("field key") field_value = regex(r"[A-Za-z0-9#]+").desc("field value") field = seq(field_key, string(":") >> field_value).map(tuple) passport_end = eof | string_from("\n", "\n\n") @generate def passport(): _fields = {} while True: fkey, fval = yield field _fields[fkey] = fval m = yield string_from(" ", "\n").optional() if m == "\n": n = yield peek(string("\n").optional())
notes = yield get_notes # make a copy of this list so that we don't perturb the note. hds = list(notes.get('hds', [])) if len(hds) > 0: return fail("Want additional heredocs") return CommandSequence(seq) eaten_newline = string("\\\n").result(Token("")) variable_id = regex("[a-zA-Z_][a-zA-Z0-9_]*") variable_name = regex("[1-9][0-9]*|[0\\?!#@\\*]") | variable_id word_id = regex('[^\\s\'()$=";|<>&\\\\{}`*]+').map(ConstantString) word_redir = string_from("<&", "<<", "<", ">&", ">>", ">").map(Token) word_single = ( string("'") >> regex("[^']*") << string("'")).map(ConstantString) word_expr = string("$(") >> command_sequence << string(")") word_backslash = string("\\") >> any_char.map(ConstantString) word_variable_reference = ( string("$") >> variable_name).map(ConstantString).map(VarRef) word_variable_name = variable_id.map(Id) word_equals = string("=").map(Token) word_dbrace = string("{}").map(Token) word_glob = string("**").result(STARSTAR) | string("*").result(STAR) e_id = variable_id @generate("word-variable-complex")
definition = f'{definition} = {self.default_value}' return definition C_COMMENT_PATTERN = re.compile(r'\/\/.*$|\/\*.*?\*\/', re.MULTILINE) EQ = parsy.char_from('=').desc('=') COMMA = parsy.char_from(',').desc(',') COLON = parsy.char_from(':').desc(':') SEMICOLON = parsy.char_from(';').desc(';') L_BRACE = parsy.char_from('{').desc('{') R_BRACE = parsy.char_from('}').desc('}') L_PARENTHESES = parsy.char_from('(').desc('(') R_PARENTHESES = parsy.char_from(')').desc(')') TYPE = parsy.string_from(*TYPES) WHITESPACE = parsy.whitespace.desc('whitespace') OPTIONAL_WHITESPACE = WHITESPACE.optional() SEMANTIC = parsy.string_from(*SEMANTICS) IDENTIFIER_CHARS = parsy.letter | parsy.decimal_digit | parsy.string("_") IDENTIFIER = (parsy.letter + IDENTIFIER_CHARS.many().concat()).desc('identifier') FLOAT = parsy.regex(r'[+-]?(?:\d+\.?\d*|\.\d+)').desc('float').map(Float) ARGS = FLOAT.sep_by( OPTIONAL_WHITESPACE >> parsy.string(',') << OPTIONAL_WHITESPACE, min=1) TYPE_CONSTRUCTOR = parsy.seq( TYPE << OPTIONAL_WHITESPACE << L_PARENTHESES, OPTIONAL_WHITESPACE >> ARGS << OPTIONAL_WHITESPACE << R_PARENTHESES).combine(TypeConstructor) DEFAULT_VALUE = FLOAT | TYPE_CONSTRUCTOR VARYING = ( parsy.seq(TYPE << WHITESPACE, IDENTIFIER << OPTIONAL_WHITESPACE,
# Separators with surrounding whitespace folded in.
comma = parsy.regex(r'\s*,\s*')
semicolon = parsy.regex(r'\s*;\s*')

# Numeric literals.
intLit = parsy.regex(r'(0|[1-9][0-9]*)').map(int).desc("integer")
floatLit = parsy.regex(r'-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?').map(
    float).desc("floating point number")

# String literals: surrounding quotes are stripped; no escape handling.
singleQuoteString = parsy.regex(r"'[^']*'").map(lambda s: s[1:-1])
doubleQuoteString = parsy.regex(r'"[^"]*"').map(lambda s: s[1:-1])
strLit = (singleQuoteString | doubleQuoteString).desc("string")

# Variable names, wrapped in the Identifier AST node.
identifier = parsy.regex(r'[a-zA-Z][a-zA-Z0-9_]*').map(Identifier).desc(
    "identifier (variable)")

# Comparison and mapping operators.
operator = parsy.string_from('=', '<', '>', '<=', '>=')
mapoper = parsy.string_from('<->')


@parsy.generate
def function():
    """Parse a function call ``name(args...)`` into a Function node."""
    fname = yield identifier
    yield oparen
    args = yield (mapping_binop | basic_expr.sep_by(comma))
    yield cparen
    return Function(fname, args)


# Fix: Parser.desc() returns a NEW parser; the original called it as a
# bare statement and discarded the result, so the description was never
# attached.  Rebind the name so the description takes effect.
function = function.desc("function call")