def parse_obj(self, tokens):
    """Check that *tokens* contains a well-formed while-block.

    Returns True when the grammar matches the whole input; on failure the
    raised exception object itself is returned (original contract).
    """
    ws = regex(r'\s*')

    def padded(p):
        # Wrap a parser in optional whitespace on both sides.
        return ws >> p << ws

    open_brace = padded(string('{'))
    close_brace = padded(string('}'))
    semi = padded(string(';'))
    # Any word that is not the keyword 'while', optionally ';'-terminated.
    word_token = ws >> regex(r'\b(?:(?!while)\S)+\b') << ws
    statement = word_token << semi | word_token
    body = open_brace >> statement.many() << close_brace
    loop = string('while') >> statement.many() >> body
    padded_loop = ws >> loop << ws
    grammar = (statement.many() >> padded_loop << statement.many()
               | statement.many() >> padded_loop)
    try:
        grammar.parse(tokens)
        return True
    except Exception as err:
        # Mirror the original behavior: hand back the exception, don't raise.
        return err
def test_combine_dict(self):
    """combine_dict should feed the tagged captures to date() as kwargs."""
    two_digits = regex(r'[0-9]{2}').map(int)
    four_digits = regex(r'[0-9]{4}').map(int)
    ddmmyyyy = seq(
        two_digits.tag('day'),
        two_digits.tag('month'),
        four_digits.tag('year'),
    ).map(dict).combine_dict(date)
    self.assertEqual(ddmmyyyy.parse('05042003'), date(2003, 4, 5))
def parse_kind(name: str, model: Dict[str, Kind]) -> Kind:
    """Parse a kind expression such as ``foo``, ``foo[][]`` or
    ``dictionary[key_kind, value_kind]`` into a Kind object.

    Plain names are resolved through *model*; arrays and dictionaries may
    nest (the two generator parsers below are mutually recursive).

    Raises AttributeError for a name not registered in *model*, and the
    parser's own error for malformed syntax.
    """
    def kind_by_name(kind_name: str) -> Kind:
        # Resolve a simple name; an unknown name is a caller/registration error.
        if kind_name not in model:
            raise AttributeError(f"Property kind is not known: {kind_name}. Have you registered it?")
        return model[kind_name]

    simple_kind_parser = regex("[A-Za-z][A-Za-z0-9_.]*").map(kind_by_name)
    bracket_parser = string("[]")
    dict_string_parser = string("dictionary[")
    comma_parser = regex("\\s*,\\s*")
    bracket_r = string("]")

    @make_parser
    def array_parser() -> Parser:
        # Element type followed by one or more '[]' suffixes; the suffix
        # count gives the array depth.
        inner = yield dictionary_parser | simple_kind_parser
        brackets = yield bracket_parser.times(1, float("inf"))
        return ArrayKind.mk_array(inner, len(brackets))

    @make_parser
    def dictionary_parser() -> Parser:
        # dictionary[<simple key kind>, <any value kind>]
        # Note: keys must be simple kinds; values may nest arbitrarily.
        yield dict_string_parser
        key_kind = cast(Kind, (yield simple_kind_parser))
        yield comma_parser
        value_kind = yield array_parser | dictionary_parser | simple_kind_parser
        yield bracket_r
        return DictionaryKind(key_kind, value_kind)

    # Try the most specific forms first; a bare name falls through to simple.
    return (array_parser | dictionary_parser | simple_kind_parser).parse(name)  # type: ignore
def __init__(self):
    """Build ``self.parser``: parses a file of named ``name { ... }`` blocks
    into a dict.

    'Region' blocks contain nested key=value sub-blocks (aggregated with
    ``dict_of_list``); any other block's body is kept as raw text up to the
    closing brace.
    """
    spaces = regex(r'[ \t]*')   # Excludes newline
    whitespace = regex(r'\s*')  # Includes newline
    newline = string('\n')
    equal = string('=')
    # Whitespace-padded braces; their parsed values are discarded by the
    # '>>'/'<<' combinators in block() below.
    lbrace = whitespace << string('{') << whitespace
    rbrace = whitespace << string('}') << whitespace
    # These parsers don't terminate blocks
    # BUGFIX: raw string — the old '[^\s=}]+' (non-raw) relied on Python
    # passing the invalid escape '\s' through verbatim, which is a
    # SyntaxWarning on Python 3.12+ and slated to become an error.
    word = regex(r'[^\s=}]+')
    words = word + (spaces + word).many().concat()
    characters = regex(r'[^}]*')
    key_value_line = seq(spaces >> word << spaces << equal,
                         spaces >> words << spaces)
    key_value_lines = key_value_line.sep_by(newline).map(dict)

    def block(name, content):
        # A section: its name, then '{ content }' -> [name, content].
        return seq(whitespace >> name, lbrace >> content << rbrace)

    key_value_block = block(word, key_value_lines)
    key_value_blocks = key_value_block.many().map(dict_of_list)
    region_block = block(string('Region'), key_value_blocks)
    other_block = block(word, characters)
    self.parser = (region_block | other_block).many().map(dict)
def test_seq_kwargs(self):
    """seq() keyword arguments should produce a dict keyed by those names."""
    token = regex(r"\S+")
    name_parser = seq(first_name=token << whitespace, last_name=token)
    expected = {'first_name': 'Jane', 'last_name': 'Smith'}
    self.assertEqual(name_parser.parse("Jane Smith"), expected)
def test_combine_dict_list(self):
    """combine_dict should build a namedtuple from tagged captures."""
    Pair = namedtuple('Pair', ['word', 'number'])
    letters = regex(r'[A-Z]+').tag('word')
    digits = regex(r'[0-9]+').map(int).tag('number')
    parser = seq(letters, digits).combine_dict(Pair)
    expected = Pair(word='ABC', number=123)
    self.assertEqual(parser.parse('ABC123'), expected)
def lexeme(p):
    """
    From a parser (or string), make a parser that consumes whitespace on
    either side.
    """
    parser = string(p) if isinstance(p, str) else p
    padding = regex(r'\s*')
    return padding >> parser << padding
def whole_program():
    """Parse a whole program: whitespace-separated atoms, with optional
    leading and trailing whitespace."""
    ws = regex(r'\s*')
    yield ws
    parsed_atoms = yield atom.sep_by(ws)
    yield ws
    return parsed_atoms
def lexer(code):
    """Tokenize *code* into ints, floats, and single-char operator strings."""
    ws = regex(r'\s*')
    int_token = digit.at_least(1).concat().map(int)
    # A float requires the '.'; digits on either side are optional.
    float_token = (digit.many() + string('.').result(['.'])
                   + digit.many()).concat().map(float)
    operator = regex(r'[()*/+-]')
    token = (float_token | int_token | operator) << ws
    return (ws >> token.many()).parse(code)
def slist():
    """Parse a parenthesised or bracketed list of atoms into an RList,
    recording the source span."""
    begin = yield parsy.line_info
    opener = yield parsy.regex(r'[([]')
    items = yield atom.sep_by(parsy.whitespace)
    # Close with whichever delimiter matches the opener.
    closer = ')' if opener == '(' else ']'
    yield regex(r'\s*') >> parsy.string(closer)
    end = yield parsy.line_info
    return RList(items, sq=opener == '[', span=to_range(begin, end))
def word():
    """Parse a possibly-quoted word.

    With a quote present, backslash may escape a backslash or the active
    quote character; the quoted text (escapes resolved) is returned.

    NOTE(review): on the unquoted path this returns the ``unquoted_word``
    parser object itself rather than yielding it — confirm callers expect a
    Parser here and not a parsed string.
    """
    quote_char = yield regex('[\'"]').optional()
    if quote_char is None:
        return unquoted_word
    else:
        # '\\' followed by '\\' or the quote char; keep only the escaped char.
        escape = regex(fr'\\[\\{quote_char}]').map(lambda x: x[-1])
        value_char = escape | regex(fr'[^\\{quote_char}]+')
        word = yield value_char.many().concat()
        yield string(quote_char).desc("quote")
        return word
def line_of_code():
    """Parse one disassembly line:
    ``<address>: <1-4 byte codes> <mnemonic> [op1][, op2][; comment]``.

    Returns ('loc', address, byte_codes, mnemonic, op1, op2); missing
    operands are None. The trailing comment is consumed but not returned.
    """
    yield whitespace
    address = yield lexeme(number)
    yield colon
    # One to four byte codes, each terminated by a single space.
    byte_codes = yield lexeme((byte_code << parsy.string(' ')).times(1, max=4))
    mnemonic = yield lexeme(parsy.regex(r'\w{2,5}'))
    op1 = yield lexeme(operand.optional())
    yield lexeme(comma.optional())
    op2 = yield lexeme(operand.optional())
    yield lexeme(semicolon.optional())
    # Consumed to reach end of line; intentionally dropped from the result.
    comment = yield lexeme(parsy.regex(r'.+').optional())
    return 'loc', address, byte_codes, mnemonic, op1, op2
def parseXYZContent(content):
    """Parse the text of an XYZ molecular-geometry file.

    Returns ``(coords, atomNumbers, atomNames, latticeVectors)``:
    *coords* is an (N, 3) float array, *atomNumbers* the proton numbers
    looked up from ``atom_proton_numbers``, and *latticeVectors* is None
    unless the file contains VEC1..VEC3 rows (crystal convention), in which
    case those rows are removed from the atom table and returned separately.
    """
    intParser = regex(r"[-+]?\d+").map(int)
    floatParser = regex(r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?").map(float)
    newLine = regex(r'\n')
    untilNewLine = regex(r'[^\n]*')

    @generate
    def parseHeader():
        num = yield intParser  # number of atoms
        yield newLine >> untilNewLine >> newLine  # comment
        return num

    @generate
    def parseRow():
        atom = yield regex(r'\s*[a-zA-Z0-9.]*')
        try:
            # Numeric element column: translate proton number to a symbol.
            atomNumber = int(atom)
            atom = atom_names[atomNumber]
        except ValueError:
            pass
        yield whitespace.many()
        x = yield floatParser
        yield whitespace.many()
        y = yield floatParser
        yield whitespace.many()
        z = yield floatParser
        return np.array([x, y, z]), atom.strip()

    @generate
    def parseContent():
        yield parseHeader.optional()
        table = yield parseRow.many()
        yield whitespace.many()
        atomCoords = [a[0] for a in table]
        atomNames = [a[1] for a in table]
        return np.array(atomCoords), np.array(atomNames)

    table, atomNames = parseContent.parse(content)
    latticeVectors = None
    if 'VEC1' in atomNames:
        # this is a crystal: pull VEC1..VEC3 out of the atom table
        latticeVectors = []
        for d in range(1, 4):
            vName = f'VEC{d}'
            assert vName in atomNames
            i = np.where(atomNames == vName)[0][0]
            latticeVectors.append(table[i])
            table = np.delete(table, i, axis=0)
            atomNames = np.delete(atomNames, i, axis=0)
        latticeVectors = np.array(latticeVectors)
    atomNumbers = np.array([atom_proton_numbers[an] for an in atomNames])
    return table, atomNumbers, atomNames, latticeVectors
def _nested() -> parsy.Parser:
    """
    Self-referential recursion helper for `type_atom` (looks for further
    type defs nested between `[` `]` pairs)
    """
    return (
        yield between(
            parsy.regex(r"\[\s*"),
            parsy.regex(r",?\s*\]"),  # allow line-breaks and trailing-comma
            type_atom.sep_by(parsy.regex(r",\s*")),  # includes new-lines
        )
    )
def description_parser():
    """Build a parser for JVM method descriptors such as
    ``(ILjava/lang/String;)V``.

    Produces ``['(param,types)', return_type]`` with class names rewritten
    from slash to dot form and base-type codes expanded via BASE_TYPE_NAMES.
    """
    def dotted(*parts):
        return '.'.join(parts)

    ident = regex(r'[0-9a-zA-Z\$_]+')
    # 'Lpkg/Name;' -> 'pkg.Name'
    class_name = string('L') >> ident.sep_by(
        string('/')).combine(dotted) << string(';')
    base_type = regex('[BCDFIJSZ]').map(lambda code: BASE_TYPE_NAMES[code])
    # Leading '[' chars become a trailing '[]' suffix per dimension.
    array = seq(
        string('[').at_least(1).map(lambda brackets: '[]' * len(brackets)),
        base_type | class_name,
    ).combine(lambda suffix, elem: elem + suffix)
    parameter = base_type | class_name | array
    parameters = parameter.many().combine(
        lambda *types: '(' + ','.join(types) + ')')
    return_type = string('V').map(lambda _: 'void') | parameter
    return seq(string('(') >> parameters << string(')'), return_type)
def function():
    """Parse a function header line: ``<address> <name>:``; returns a
    ('fn', address, name) tuple."""
    address = yield lexeme(number)
    yield parsy.string('<')
    name = yield parsy.regex(r'[_\w\d\.]+')
    yield lexeme(parsy.string('>'))
    yield lexeme(parsy.string(':'))
    return 'fn', address, name
def test_regex(self):
    """regex() returns the matched text and rejects non-matching input."""
    parser = regex(r'[0-9]')
    for digit_char in ('1', '4'):
        self.assertEqual(parser.parse(digit_char), digit_char)
    self.assertRaises(ParseError, parser.parse, 'x')
def test_regex_bytes(self):
    """A bytes-pattern regex() parses bytes input and yields bytes."""
    parser = regex(rb'[0-9]')
    for digit_byte in (b'1', b'4'):
        self.assertEqual(parser.parse(digit_byte), digit_byte)
    self.assertRaises(ParseError, parser.parse, b'x')
def backtick():
    """
    Parse backticks. This is fugly.

    Backticks: I gave up on single-pass parsing here. It would be doable
    with the 'notes' extension, but would require enough context carrying
    forward that it'd need reimplementations of all bare string- and
    regex-matching things to understand how many levels deep they are.

    Here is the skinny: the shell has the $( ) which offer an objectively
    cleaner syntax. We parse backticks recursively because it's about the
    neatest approach to implement the shell spec that describes the feature
    in terms of a recursive implementation.

    The Posix shell spec, section 2.6.3, says this:

    Within the backquoted style of command substitution, <backslash> shall
    retain its literal meaning, except when followed by: '$', '`', or
    <backslash>. The search for the matching backquote shall be satisfied
    by the first unquoted non-escaped backquote; during this search, if a
    non-escaped backquote is encountered within a shell comment, a
    here-document, an embedded command substitution of the $(command) form,
    or a quoted string, undefined results occur. A single-quoted or
    double-quoted string that begins, but does not end, within the "`...`"
    sequence produces undefined results.

    What a mess.
    """
    # Collect everything up to the first unescaped closing backquote,
    # resolving the three escapes the spec recognizes (\`, \$, \\);
    # any other lone backslash is kept literally.
    content = yield string("`") >> (
        string("`").should_fail("backtick") >>
        (string(r"\`").result("`")
         | string(r"\$").result("$")
         | string(r"\\").result("\\")
         | regex(r'[^\\`]*')
         | string("\\"))).many().concat() << string("`")
    # Recursively parse the collected text as a full command sequence.
    return command_sequence.parse(content)
def fields():
    """Parse a whitespace-separated mix of indexed (``value``) and named
    (``key=value``) fields into ``Fields(by_index, by_name)``.

    Once any named field has been seen, only named fields are accepted for
    the rest of the input.
    """
    by_index = []
    by_name = {}
    space = regex(r'\s+')

    @generate
    def field():
        # Always try matching named fields before indexed fields. This is
        # necessary because the parser for an indexed field will match the
        # first half of a named field but stop at the '='. The next parser
        # will then fail when it tries to continue from there.
        if by_name:
            yield named_field
        else:
            yield named_field | indexed_field

    @generate
    def named_field():
        # Side effect: records the pair in the enclosing by_name dict.
        k, v = yield key_value
        by_name[k] = v

    @generate
    def indexed_field():
        # Side effect: appends to the enclosing by_index list.
        v = yield word
        by_index.append(v)

    yield field.sep_by(space, min=1)
    return Fields(by_index, by_name)
def test_space2(s0, s1):
    """ following non-whitespace is not consumed """
    parser = space() + parsy.regex(r'.*')
    assert parser.parse(s0 + s1) == s1
def number():
    """Parse a signed integer or decimal literal via map_number and attach
    its source range to the result."""
    begin = yield parsy.line_info
    value = yield regex(r'[+-]?\d+(\.\d+)?').map(map_number)
    end = yield parsy.line_info
    value.range = to_range(begin, end)
    return value
def parse_header(name: str, clock: str) -> Tuple[str, Dict[str, str]]:
    """Scan the Verilator-generated header ``obj_dir/V<name>.h`` and collect
    its port declarations.

    Returns ``(name, portlist)`` where *portlist* maps each port name to its
    width tag ('IN8' .. 'OUTW'). The *clock* port is removed from the result.

    Raises KeyError if *clock* is not found among the ports, and OSError if
    the header file cannot be opened.
    """
    in8 = string(' VL_IN8(').map(lambda x: 'IN8')
    in16 = string(' VL_IN16(').map(lambda x: 'IN16')
    in32 = string(' VL_IN(').map(lambda x: 'IN32')
    in64 = string(' VL_IN64(').map(lambda x: 'IN64')
    inw = string(' VL_INW(').map(lambda x: 'INW')
    out8 = string(' VL_OUT8(').map(lambda x: 'OUT8')
    out16 = string(' VL_OUT16(').map(lambda x: 'OUT16')
    out32 = string(' VL_OUT(').map(lambda x: 'OUT32')
    out64 = string(' VL_OUT64(').map(lambda x: 'OUT64')
    outw = string(' VL_OUTW(').map(lambda x: 'OUTW')
    ports = (in8 | in16 | in32 | in64 | inw | out8 | out16 | out32 | out64
             | outw).desc('variable width definition')
    # BUGFIX: was regex('[a-za-z]+\w*') — the class [a-za-z] only matches
    # lowercase, silently dropping ports whose names start with an uppercase
    # letter or underscore; also made the pattern a raw string ('\w' is an
    # invalid escape in a plain string on modern Python).
    varname = regex(r'[a-zA-Z_]\w*').desc('variable name')
    with open(f'obj_dir/V{name}.h', 'r') as f:
        lines = f.readlines()
    portlist = {}
    # Hoisted out of the loop: the combined line parser is loop-invariant.
    line_parser = seq(ports, varname)
    for line in lines:
        try:
            port_def, _ = line_parser.parse_partial(line)
            portlist[port_def[1]] = port_def[0]
        except ParseError:
            # Not a port-declaration line; skip it.
            pass
    # remove clock from port list; a KeyError here means the header declared
    # no such port, which is worth surfacing loudly.
    del portlist[clock]
    return name, portlist
def expr_cont_quantified():
    """Parse the remainder of a quantified expression — binder symbol,
    bound variable, '.', body, ')' — into an EQuantified node."""
    binder = yield lexme(regex(r"@|!|\?!|\?|\\|lambda"))
    bound_var = yield ident
    yield dot
    quantified_body = yield expr
    yield rparen
    return EQuantified(binder, bound_var, quantified_body)
def eol():
    """
    Parse and consume a single '\n' character. If there are any heredocs
    pending, immediately consume more lines of input until all heredocs are
    filled in.
    """
    yield string("\n")
    # Do we need to consume some heredocs?
    notes = yield get_notes
    # make a copy of this list so that we don't perturb the note.
    hds = list(notes.get('hds', []))
    while len(hds) > 0:
        # The next heredoc to scan for
        hd = hds.pop(0)
        lines = []
        while True:
            # A full line, the final unterminated line, or EOF sentinel.
            line = yield eof.result(EOF) | regex(
                "[^\n]*\n") | regex("[^\n]*") << eof
            if line is EOF:
                return fail("looking for heredoc ending with " + hd.end)
            if line.rstrip("\n") == hd.end:
                break
            lines.append(line)
        content = '\n'.join(lines)
        if content == '':
            content = ConstantString("")
        elif hd.quote is None:
            # Unquoted delimiter: body undergoes double-quote-style expansion.
            content = double_content.parse(content)
        else:
            # Quoted delimiter: body is taken literally.
            content = ConstantString(content)
        # Back-fill the HereDoc content. Note, this is *not* undone by
        # backtracking. However, a backtrack and re-parse may overwrite this
        # value; so in the end, it's likely that this will do what we want.
        hd.file = content
    # `notes` itself is a shallow copy, so we don't need to worry about
    # copying it here.
    notes['hds'] = hds
    yield put_note(notes)
    return "\n"
def test_regex():
    """regex() yields the matched text and raises ParseError otherwise."""
    parser = regex(r'[0-9]')
    for ch in ('1', '4'):
        assert parser.parse(ch) == ch
    try:
        parser.parse('x')
        assert False
    except ParseError:
        pass
def __init__(self):
    """Build ``self.p``, a parser for device responses: ``indi ...``
    property dumps, ``GPO``/``GPI`` pin-state lines, and ``ERROR ...``
    messages."""
    # TODO: finish this parser so it does more than just parse indi and GPO responses
    indi_operator = string_from("indi")
    gp_operator = string_from("GPO", "GPI")
    space = string(" ")
    obj = regex(r"[a-zA-Z0-9.#]*")
    name = regex(r"[a-zA-Z]*")
    simple_string = regex(r"[a-zA-Z ]*")
    equals = string("=")
    value = regex(r"[^,]*") | regex(r'".*"')
    number = regex(r"[0-9]+")
    # Exactly five pin flags. NOTE(review): 'h' maps to False and 'l' to
    # True — this polarity looks inverted for active-high pins; confirm
    # against the device protocol.
    gp_value = regex(r"[hl]").map(lambda v: {"h": False, "l": True}[v]) * 5
    # indi <path> name=value[, name=value ...] -> {'path': ..., 'info': {...}}
    indi_parser = seq(indi_operator << space).then(
        seq(
            path=obj << space.optional(),
            info=seq(
                name=name << equals,
                value=value << string(", ").optional()).map(lambda x: {
                    x["name"]: x["value"]
                }).many().map(
                    lambda kv: {k: v for d in kv for k, v in d.items()}),
        ))
    # GPO/GPI <number> <hhlll...> -> {'number': ..., 'pins': [...]}
    gp_parser = seq(gp_operator << space).then(
        seq(number=number << space, pins=gp_value))
    # ERROR <number> <message> -> positional [number, message]
    error_parser = seq(string("ERROR") << space).then(
        seq(number << space, simple_string))
    self.p = indi_parser | gp_parser | error_parser
def parseRow():
    """Parse one table row — element label then x, y, z floats — into a
    dict with keys 'atom', 'x', 'y', 'z'."""
    element = yield parsy.regex(r'\s*[a-zA-Z.]*')
    coords = []
    for _ in range(3):
        yield parsy.whitespace.many()
        coords.append((yield floatParser))
    x, y, z = coords
    return {'atom': element, 'x': x, 'y': y, 'z': z}
def parseRow():
    """Parse one row — element label then x, y, z floats — into
    ``(np.array([x, y, z]), stripped_label)``."""
    label = yield regex(r'\s*[a-zA-Z.]*')
    values = []
    for _ in range(3):
        yield whitespace.many()
        values.append((yield floatParser))
    return np.array(values), label.strip()
def test_nested_basic():
    """nested() turns balanced parens into correspondingly nested lists."""
    parser = nested(
        parsy.string('('),
        parsy.string(')'),
        parsy.regex('[0-9]+').map(int),
        parsy.string(' '),
    )
    expected = [0, 1, [2, 3], [4, 5, 6], 7, 8]
    assert parser.parse("(0 1 (2 3) (4 5 6) 7 8)") == expected
from parsy import string, regex, generate
import re
import pdb

# Lexical layer: runs of whitespace and ';'-to-end-of-line comments are
# ignorable between tokens.
whitespace = regex(r'\s+', re.MULTILINE)
comment = regex(r';.*')
ignore = (whitespace | comment).many()

# A lexeme is a token that consumes any trailing ignorable text.
lexeme = lambda p: p << ignore

lparen = lexeme(string('('))
rparen = lexeme(string(')'))
number = lexeme(regex(r'\d+')).map(int)
symbol = lexeme(regex(r'[\d\w_-]+'))
true = lexeme(string('#t')).result(True)
false = lexeme(string('#f')).result(False)
# Booleans and numbers must be tried before the more general symbol.
atom = true | false | number | symbol


@generate
def form():
    # A parenthesised list of expressions -> Python list.
    # `expr` is defined later in the module (forward reference).
    yield lparen
    els = yield expr.many()
    yield rparen
    return els


@generate
def quote():
    # 'expr  ->  ['quote', expr]
    yield string("'")
    e = yield expr
    return ['quote', e]
from parsy import string, regex, generate, ParseError
import pdb

letter = regex(r'[a-zA-Z]')
digit = regex(r'[0-9]')


def test_string():
    """string() matches exactly its literal and rejects anything else."""
    parser = string('x')
    assert parser.parse('x') == 'x'
    try:
        parser.parse('y')
        assert False
    except ParseError:
        pass


def test_regex():
    """regex() yields the matched text and rejects non-matching input."""
    parser = regex(r'[0-9]')
    for ch in ('1', '4'):
        assert parser.parse(ch) == ch
    try:
        parser.parse('x')
        assert False
    except ParseError:
        pass


def test_then():
    """'>>' discards the left result and keeps the right."""
    xy_parser = string('x') >> string('y')
    assert xy_parser.parse('xy') == 'y'
    for bad in ('y', 'z'):
        try:
            xy_parser.parse(bad)
            assert False
        except ParseError:
            pass
from parsy import string, regex, generate import re from sys import stdin whitespace = regex(r'\s*', re.MULTILINE) lexeme = lambda p: p << whitespace lbrace = lexeme(string('{')) rbrace = lexeme(string('}')) lbrack = lexeme(string('[')) rbrack = lexeme(string(']')) colon = lexeme(string(':')) comma = lexeme(string(',')) true = lexeme(string('true')).result(True) false = lexeme(string('false')).result(False) null = lexeme(string('null')).result(None) number = lexeme( regex(r'-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?') ).map(float) string_part = regex(r'[^"\\]+') string_esc = string('\\') >> ( string('\\') | string('/') | string('b').result('\b') | string('f').result('\f') | string('n').result('\n') | string('r').result('\r') | string('t').result('\t')