def space(
    p_space=char.space,
    p_line_comment=parsy.fail('line-comment'),
    p_block_comment=parsy.fail('block-comment'),
):
    """
    Produces a parser that consumes white space in general.

    It's expected that you create such a parser once and pass it to other
    functions in this package as needed (when you see `p_space_consumer` in
    documentation, usually it means that something like `space()` is
    expected there).

    Args:
        p_space: is used to parse blocks of space characters. You can use
            'char.space' for this purpose, or your own parser (if you don't
            want to automatically consume newlines, for example). Make sure
            the parser does not succeed on empty input though.
        p_line_comment: is used to parse line comments. You can use
            `megaparsy.skip_line_comment` if you don't need anything
            special.
        p_block_comment: is used to parse block (multi-line) comments. You
            can use `megaparsy.skip_block_comment` or
            `skip_block_comment_nested` if you don't need anything special.

    If you don't want to match a kind of comment, simply pass `parsy.fail()`
    and `space` will just move on or finish depending on whether there is
    more white space for it to consume.

    Returns:
        parsy.Parser: always succeeds, yielding ``''``.
    """
    # Fix: the previous implementation skipped each component at most once
    # and in a fixed order, so input such as "space, comment, space" left
    # the trailing space unconsumed.  Loop over the alternatives until none
    # of them matches (mirrors megaparsec's `space` combinator).
    return (p_space | p_line_comment | p_block_comment).many().result('')
def parse_key_value():
    """Parse a ``key : value`` pair, requiring the key term to be ground.

    Returns:
        tuple: ``(key, value)`` parsed terms.
    """
    k = yield parse_term
    if not is_ground(k):
        # Fix: `fail(...)` only *builds* a failing parser; it must be
        # yielded, otherwise the call was a no-op and non-ground keys
        # were silently accepted.
        yield fail(f'{k} not ground')
    yield token(':')
    v = yield parse_term
    return k, v
def url():
    """Parse a CSS-style ``url(...)`` value and return the URL string.

    The opening and closing quotation marks must match.
    """
    yield iden_token_url
    yield open_bracket
    open_mark = yield quote_mark
    url = yield regex(r'[^\'\"()\s]+')
    close_mark = yield quote_mark
    if open_mark != close_mark:
        # Fix 1: the failing parser must be yielded (a bare `fail(...)`
        # call is a no-op, so mismatched quotes were accepted).
        # Fix 2: repaired the message — "marsk" typo, and the trailing
        # `' ''` was adjacent-literal concatenation of an *empty* string,
        # so the intended '' never appeared in the message.
        yield fail(
            'shall enclose url into similar quotation marks "" or \'\''
        )
    yield close_bracket
    return url
def opt_container_impl():
    """Parse an HTML container optionally wrapped in a jinja if/endif pair.

    Shape: ``{% if X %} <tag> {% endif %} ... {% if X %} </tag> {% endif %}``.
    Both ``{% if %}`` markers must carry the same condition.

    Returns:
        list: the seven parsed nodes in source order.
    """
    o_first_if_node = yield jinja_if.skip(whitespace)
    o_tag_node = yield opening_tag.skip(whitespace)
    c_first_if_node = yield jinja_endif
    content_nodes = yield content
    o_second_if_node = yield jinja_if.skip(whitespace)
    if o_second_if_node.content != o_first_if_node.content:
        # Fix: the original interpolated `content` — the *parser* object —
        # into the message, which would raise TypeError; use the first
        # if-node's content, which is what the second must match.
        yield P.fail(
            'expected `{% if ' + o_first_if_node.content + ' %}`'
        )
        return
    html_tag_name = o_tag_node.name
    if isinstance(html_tag_name, str):
        closing_tag = make_closing_tag_parser(P.string(html_tag_name))
    else:
        assert isinstance(html_tag_name, Jinja)
        closing_tag = make_closing_tag_parser(jinja)
    c_tag_node = yield closing_tag.skip(whitespace)
    c_second_if_node = yield jinja_endif
    return [
        o_first_if_node,
        o_tag_node,
        c_first_if_node,
        content_nodes,
        o_second_if_node,
        c_tag_node,
        c_second_if_node,
    ]
def command():
    """Parse a simple shell command: assignments, words and redirects.

    Assignments are only recognised before the first plain word
    (POSIX-style); a reserved word may not appear as the first word.

    Returns:
        Command: with its assignments and redirects attached.
    """
    words = []
    assignments = []
    redirs = []
    assignments_possible = True
    while True:
        yield ws.optional()
        if assignments_possible:
            w = yield assignment | redirect | word
        else:
            w = yield redirect | word
        if isinstance(w, Word):
            # Once a plain word is seen, later NAME=value tokens are words.
            assignments_possible = False
        elif isinstance(w, Redirect):
            redirs.append(w)
            continue
        elif isinstance(w, Assignment):
            assignments.append(w)
            continue
        if not w:
            break
        if len(words) == 0 and w.matches_reserved(
                "while", "do", "done", "if", "then", "elif", "else",
                "fi", "case", "esac", "for"):
            # Fix: the failing parser must be yielded; `return fail(...)`
            # made the Parser object itself the (successful) parse result.
            yield fail("can't have a reserved word here")
        words.append(w)
    cmd = Command(words).with_assignment(*assignments).with_redirect(*redirs)
    return cmd
def binop():
    """Parse a binary-operator expression with precedence climbing.

    Fails (without producing a node) when the first term is not followed
    by an operator, so callers' alternation can try other expression forms.
    """
    def prec_parse(lhs, lvl):
        # One level of the precedence-climbing algorithm: consume
        # operators of precedence >= lvl, recursing for tighter-binding
        # operators on the right-hand side.
        @parsy.generate
        def helper():
            lookahead = yield peek(operator)
            while lookahead and prec(lookahead) >= lvl:
                op = yield operator
                rhs = yield indexer | func | unop | paren(
                    expr) | literal | variable
                lookahead = yield peek(operator)
                while lookahead and prec(lookahead) > prec(op):
                    rhs = yield prec_parse(rhs, lvl=prec(lookahead))
                    lookahead = yield peek(operator)
                nonlocal lhs
                if op == '.':
                    lhs = ast.dot(lhs, rhs)
                else:
                    lhs = ast.func(op, lhs, rhs)
            return lhs
        return helper
    lhs = yield indexer | func | unop | paren(expr) | literal | variable
    if not (yield peek(operator)):
        # Fix: the failing parser must be yielded; `return parsy.fail(...)`
        # handed the Parser object back as a "successful" parse result.
        yield parsy.fail('binary operator')
    return (yield prec_parse(lhs, lvl=0))
def parameter_list():
    """Parse an optional comma-led parameter list.

    Returns:
        str | None: the joined parameter text, or None when there is no
        leading comma (the optional list is absent).
    """
    comma = yield parsy.string(',').optional()
    if comma:
        parameters = yield parameter_format.many()
        if parameters:
            return ''.join(parameters)
        else:
            # Fix: the failing parser must be yielded;
            # `return parsy.fail(...)` returned the Parser object as the
            # (successful) result instead of failing the parse.
            yield parsy.fail('named parameters not supported yet')
def line():
    """Parse one table line: ``<kind> <codepoint> = <rhs>``.

    The right-hand-side parser is selected by the entry kind.

    Returns:
        tuple: ``(binary, entry)`` — the codepoint and the parsed RHS.
    """
    kind = yield entry_kind
    binary = yield codepoint
    yield parsy.string('=')
    if kind == TableEntryKind.TEXT:
        entry = yield text_rhs
    elif kind == TableEntryKind.CONTROL:
        entry = yield control_rhs
    elif kind == TableEntryKind.END:
        entry = yield end_rhs
    elif kind == TableEntryKind.HOOK:
        entry = yield hook_rhs
    else:
        # Fix: the failing parser must be yielded; `return parsy.fail(...)`
        # made the Parser object the (successful) parse result.
        yield parsy.fail('unsupported table entry kind')
    return (binary, entry)
def eol():
    """
    Parse and consume a single '\n' character.

    If there are any heredocs pending, immediately consume more lines of
    input until all heredocs are filled in.
    """
    yield string("\n")
    # Do we need to consume some heredocs?
    notes = yield get_notes
    # make a copy of this list so that we don't perturb the note.
    hds = list(notes.get('hds', []))
    while len(hds) > 0:
        # The next heredoc to scan for
        hd = hds.pop(0)
        lines = []
        while True:
            line = yield eof.result(EOF) | regex(
                "[^\n]*\n") | regex("[^\n]*") << eof
            if line is EOF:
                # Fix: the failing parser must be yielded;
                # `return fail(...)` made the Parser object the
                # (successful) result of parsing an unterminated heredoc.
                yield fail("looking for heredoc ending with " + hd.end)
            if line.rstrip("\n") == hd.end:
                break
            lines.append(line)
        content = '\n'.join(lines)
        if content == '':
            content = ConstantString("")
        elif hd.quote is None:
            content = double_content.parse(content)
        else:
            content = ConstantString(content)
        # Back-fill the HereDoc content. Note, this is *not* undone by
        # backtracking.  However, a backtrack and re-parse may overwrite
        # this value; so in the end, it's likely that this will do what
        # we want.
        hd.file = content
    # `notes` itself is a shallow copy, so we don't need to worry about
    # copying it here.
    notes['hds'] = hds
    yield put_note(notes)
    return "\n"
def parser():
    """Guard the current indentation level against the reference level.

    Returns:
        int: current indent level (the column of the first token on the
        line), when ``operator(actual, reference_level)`` holds.
    """
    yield p_space_consumer
    _, actual = yield parsy.line_info
    if operator(actual, reference_level):
        return actual
    else:
        # Fix: the failing parser must be yielded;
        # `return parsy.fail(...)` made the Parser object the
        # (successful) parse result instead of failing.
        yield parsy.fail(
            'indent_guard: {actual} {op} {ref}'.format(
                actual=actual,
                op=OPERATOR_MAP.get(operator, operator.__name__),
                ref=reference_level,
            )
        )
def command_sequence():
    """Parse a sequence of pipelines separated by end-of-statement tokens.

    Refuses to finish while heredocs are still pending in the notes.

    Returns:
        CommandSequence: the parsed pipelines.
    """
    seq = []
    while True:
        cmd = yield pipeline
        if cmd is not None:
            seq.append(cmd)
        else:
            break
        semi = yield (ws.optional() >> eos).optional()
        if semi is None:
            break
        if semi is EOF:
            break
    notes = yield get_notes
    # make a copy of this list so that we don't perturb the note.
    hds = list(notes.get('hds', []))
    if len(hds) > 0:
        # Fix: the failing parser must be yielded; `return fail(...)`
        # made the Parser object the (successful) parse result.
        yield fail("Want additional heredocs")
    return CommandSequence(seq)
def parser():
    """Parse one indented item, then recurse for the rest.

    Returns:
        List[str]: the parsed item values, empty at EOF or when the
        indentation falls back to the reference level.
    """
    yield p_space_consumer
    _, pos = yield parsy.line_info
    done = yield (parsy.eof.result(True)).optional()
    if done:
        return []
    else:
        if pos <= reference_level:
            return []
        elif pos == next_level:
            current_val = yield p_indented_tokens
            more_vals = yield closure()
            return [current_val] + more_vals
        else:
            # Fix: the failing parser must be yielded;
            # `return parsy.fail(...)` made the Parser object the
            # (successful) parse result instead of failing on a
            # mis-indented item.
            yield parsy.fail(
                '_indented_items: {lvl} == {pos}'.format(
                    lvl=next_level, pos=pos
                )
            )
def extension(parsers: ParserDict) -> None:
    """Register the core concat-language grammar into `parsers`.

    Mutates the given ParserDict in place: installs the top-level parser
    and the parsers for every word/statement form (literals, quotations,
    slices, imports, funcdef/classdef, casts, ...).  Registration order
    matters: several entries are extended later with `|=`, and closures
    such as `decorator`, `suite` and `bases` are referenced before they
    are assigned — they are only *called* at parse time, after this
    function has finished.
    """
    # This parses the top level of a file.
    # top level =
    # ENCODING, (word | statement | NEWLINE)*, [ NEWLINE ],
    # ENDMARKER ;
    @parsy.generate
    def top_level_parser() -> Generator[parsy.Parser, Any, TopLevelNode]:
        encoding = yield parsers.token('ENCODING')
        newline = parsers.token('NEWLINE')
        statement = parsers['statement']
        word = parsers['word']
        children = yield (word | statement | newline).many()
        # Drop the raw NEWLINE tokens; only word/statement nodes remain.
        children = [
            child
            for child in children
            if not isinstance(child, concat.lex.Token)
        ]
        yield parsers.token('ENDMARKER')
        return TopLevelNode(encoding, children)

    parsers['top-level'] = desc_cumulatively(top_level_parser, 'top level')

    # This parses one of many types of statement.
    # The specific statement node is returned.
    # statement = import statement ;
    parsers['statement'] = parsers.ref_parser('import-statement')

    # NOTE(review): defined but unused in this function — presumably kept
    # as a documented type alias; confirm before removing.
    ImportStatementParserGenerator = Generator[
        parsy.Parser, Any, ImportStatementNode
    ]

    # This parses one of many types of word.
    # The specific word node is returned.
    # word =
    # push word | literal word | name word | attribute word | quote word ;
    # literal word = number word | string word ;
    parsers['word'] = parsy.alt(
        parsers.ref_parser('push-word'),
        parsers.ref_parser('quote-word'),
        parsers.ref_parser('literal-word'),
        parsers.ref_parser('name-word'),
        parsers.ref_parser('attribute-word'),
    )

    parsers['literal-word'] = parsers.ref_parser(
        'number-word'
    ) | parsers.ref_parser('string-word')

    parsers['name-word'] = parsers.token('NAME').map(NameWordNode)

    parsers['number-word'] = parsers.token('NUMBER').map(NumberWordNode)

    parsers['string-word'] = parsers.token('STRING').map(StringWordNode)

    # This parses a quotation.
    # quote word = LPAR, word*, RPAR ;
    @parsy.generate('quote word')
    def quote_word_parser() -> Generator[parsy.Parser, Any, QuoteWordNode]:
        lpar = yield parsers.token('LPAR')
        # An optional `type-sequence :` prefix is only recognised when the
        # typechecking extension has registered that parser.
        if 'type-sequence' in parsers:
            input_stack_type_parser = parsers[
                'type-sequence'
            ] << parsers.token('COLON')
            input_stack_type = yield input_stack_type_parser.optional()
        else:
            input_stack_type = None
        children = yield parsers['word'].many()
        yield parsers.token('RPAR')
        return QuoteWordNode(children, lpar.start, input_stack_type)

    parsers['quote-word'] = quote_word_parser

    # This parses a push word into a node.
    # push word = DOLLARSIGN, word ;
    word = parsers.ref_parser('word')
    dollarSign = parsers.token('DOLLARSIGN')
    parsers['push-word'] = dollarSign >> word.map(PushWordNode)

    # Parsers an attribute word.
    # attribute word = DOT, NAME ;
    dot = parsers.token('DOT')
    name = parsers.token('NAME')
    parsers['attribute-word'] = dot >> name.map(AttributeWordNode)

    parsers['literal-word'] |= parsy.alt(
        parsers.ref_parser('none-word'),
        parsers.ref_parser('not-impl-word'),
        parsers.ref_parser('ellipsis-word'),
        parsers.ref_parser('bytes-word'),
        parsers.ref_parser('tuple-word'),
        parsers.ref_parser('list-word'),
        parsers.ref_parser('set-word'),
        parsers.ref_parser('dict-word'),
    )

    # This parses a none word.
    # none word = NONE ;
    parsers['none-word'] = parsers.token('NONE').map(NoneWordNode)

    # This parses a not-impl word.
    # not-impl word = NOTIMPL ;
    parsers['not-impl-word'] = parsers.token('NOTIMPL').map(NotImplWordNode)

    # This parses an ellipsis word.
    # ellipsis word = ELLIPSIS ;
    parsers['ellipsis-word'] = parsers.token('ELLIPSIS').map(EllipsisWordNode)

    parsers['word'] |= parsy.alt(
        parsers.ref_parser('subscription-word'),
        parsers.ref_parser('slice-word'),
        parsers.ref_parser('operator-word'),
        parsers.ref_parser('yield-word'),
        parsers.ref_parser('await-word'),
        parsers.ref_parser('assert-word'),
        parsers.ref_parser('raise-word'),
        parsers.ref_parser('try-word'),
        parsers.ref_parser('with-word'),
    )

    # This parses a subscription word.
    # subscription word = LSQB, word*, RSQB ;
    parsers['subscription-word'] = (
        parsers.token('LSQB')
        >> parsers.ref_parser('word').many().map(SubscriptionWordNode)
        << parsers.token('RSQB')
    )

    # This parses a slice word.
    # slice word = LSQB, word*, COLON, word*, [ COLON, word* ], RSQB ;
    @parsy.generate('slice word')
    def slice_word_parser():
        yield parsers.token('LSQB')
        start = yield parsers.ref_parser('word').many()
        yield parsers.token('COLON')
        stop = yield parsers.ref_parser('word').many()
        # When the second COLON is absent, the step defaults to a
        # synthesized NONE word.
        none = concat.lex.Token()
        none.type = 'NONE'
        step = [NoneWordNode(none)]
        if (yield parsers.token('COLON').optional()):
            step = yield parsers['word'].many()
        yield parsers.token('RSQB')
        return SliceWordNode([start, stop, step])

    parsers['slice-word'] = slice_word_parser

    # Seed with an always-failing parser, then OR in one parser per
    # operator from the table below.
    parsers['operator-word'] = parsy.fail('operator')
    from concat.operators import operators
    for operator_name, token_type, node_type, _ in operators:
        parser_name = operator_name + '-word'
        parsers[parser_name] = parsers.token(token_type).map(node_type)
        parsers['operator-word'] |= parsers.ref_parser(parser_name)

    # This parses a bytes word.
    # bytes word = BYTES ;
    parsers['bytes-word'] = parsers.token('BYTES').map(BytesWordNode)

    # Factory shared by tuple/list/set words: L<delim>, word list,
    # R<delim>, wrapped into the given node class.
    def iterable_word_parser(
        delimiter: str, cls: Type[IterableWordNode], desc: str
    ) -> 'parsy.Parser[Token, IterableWordNode]':
        @parsy.generate
        def parser() -> Generator:
            location = (yield parsers.token('L' + delimiter)).start
            element_words = yield word_list_parser
            yield parsers.token('R' + delimiter)
            return cls(element_words, location)

        return concat.parser_combinators.desc_cumulatively(parser, desc)

    # This parses a tuple word.
    # tuple word = LPAR, word list, RPAR ;
    parsers['tuple-word'] = iterable_word_parser(
        'PAR', TupleWordNode, 'tuple word'
    )

    # This parses a list word.
    # list word = LSQB, word list, RSQB ;
    parsers['list-word'] = iterable_word_parser(
        'SQB', ListWordNode, 'list word'
    )

    # word list = (COMMA | word+, COMMA | word+, (COMMA, word+)+, [ COMMA ]) ;
    @parsy.generate('word list')
    def word_list_parser() -> Generator:
        empty: 'parsy.Parser[Token, List[Words]]' = parsers.token(
            'COMMA'
        ).result([])
        singleton = parsy.seq(
            parsers['word'].at_least(1) << parsers.token('COMMA')
        )
        multiple_element = (
            parsers['word'].at_least(1).sep_by(parsers.token('COMMA'), min=2)
            << parsers.token('COMMA').optional()
        )
        element_words = yield (multiple_element | singleton | empty)
        return element_words

    # This parses a set word.
    # list word = LBRACE, word list, RBRACE ;
    parsers['set-word'] = iterable_word_parser(
        'BRACE', SetWordNode, 'set word'
    )

    # This parses a dict word.
    # dict word =
    # LBRACE,
    # [ key-value pair, (COMMA, key-value pair)* ],
    # [ COMMA ],
    # RBRACE ;
    # key-value pair = word*, COLON, word* ;
    @parsy.generate('dict word')
    def dict_word_parser() -> Generator:
        location = (yield parsers.token('LBRACE')).start
        elements = (
            key_value_pair.sep_by(parsers.token('COMMA'), min=0)
            << parsers.token('COMMA').optional()
        )
        element_words = yield elements
        yield parsers.token('RBRACE')
        return DictWordNode(element_words, location)

    parsers['dict-word'] = dict_word_parser

    # Late-bound by dict_word_parser above (resolved at parse time).
    key_value_pair = parsy.seq(
        parsers.ref_parser('word').many() << parsers.token('COLON'),
        parsers.ref_parser('word').many(),
    )

    parsers['yield-word'] = parsers.token('YIELD').map(YieldWordNode)

    parsers['await-word'] = parsers.token('AWAIT').map(AwaitWordNode)

    parsers['assert-word'] = parsers.token('ASSERT').map(AssertWordNode)

    parsers['raise-word'] = parsers.token('RAISE').map(RaiseWordNode)

    parsers['try-word'] = parsers.token('TRY').map(TryWordNode)

    parsers['with-word'] = parsers.token('WITH').map(WithWordNode)

    parsers['statement'] |= parsy.alt(
        parsers.ref_parser('del-statement'),
        parsers.ref_parser('async-funcdef-statement'),
        parsers.ref_parser('classdef-statement'),
        parsers.ref_parser('funcdef-statement'),
    )

    # Parsers a del statement.
    # del statement = DEL, target words ;
    # target words = target word, (COMMA, target word)*, [ COMMA ] ;
    # target word = name word
    # | LPAR, target words, RPAR
    # | LSQB, target words, RQSB
    # | attribute word
    # | subscription word
    # | slice word ;
    parsers['del-statement'] = parsers.token('DEL') >> parsers.ref_parser(
        'target-words'
    ).map(DelStatementNode)

    from concat.astutils import flatten

    parsers['target-words'] = (
        parsers.ref_parser('target-word').sep_by(parsers.token('COMMA'), min=1)
        << parsers.token('COMMA').optional()
    ).map(flatten)

    parsers['target-word'] = parsy.alt(
        parsers.ref_parser('name-word'),
        parsers.token('LPAR')
        >> parsers.ref_parser('target-words')
        << parsers.token('RPAR'),
        parsers.token('LSQB')
        >> parsers.ref_parser('target-words')
        << parsers.token('RSQB'),
        parsers.ref_parser('attribute-word'),
        parsers.ref_parser('subscription-word'),
        parsers.ref_parser('slice-word'),
    )

    # This parses an async function definition.
    # async funcdef statement = ASYNC, funcdef statement ;
    @parsy.generate('async funcdef statement')
    def async_funcdef_statement_parser() -> Generator:
        location = (yield parsers.token('ASYNC')).start
        func: FuncdefStatementNode = (yield parsers['funcdef-statement'])
        # Wrap the plain funcdef's name in a fresh token for the async node.
        name = concat.lex.Token()
        name.value = func.name
        return AsyncFuncdefStatementNode(
            name,
            func.decorators,
            func.annotation,
            func.body,
            location,
            func.stack_effect,
        )

    parsers['async-funcdef-statement'] = async_funcdef_statement_parser

    # This parses a function definition.
    # funcdef statement = DEF, NAME, [ LPAR, stack effect, RPAR ], decorator*,
    # [ annotation ], COLON, suite ;
    # decorator = AT, word ;
    # annotation = RARROW, word* ;
    # suite = NEWLINE, INDENT, (word | statement, NEWLINE)+, DEDENT | statement
    # | word+ ;
    # The stack effect syntax is defined within the typecheck module.
    @parsy.generate
    def funcdef_statement_parser() -> Generator:
        location = (yield parsers.token('DEF')).start
        name = yield parsers.token('NAME')
        if (yield parsers.token('LPAR').optional()):
            effect_ast = yield parsers['stack-effect-type']
            yield parsers.token('RPAR')
        else:
            effect_ast = None
        decorators = yield decorator.many()
        annotation = yield annotation_parser.optional()
        yield parsers.token('COLON')
        body = yield suite
        return FuncdefStatementNode(
            name, decorators, annotation, body, location, effect_ast
        )

    parsers['funcdef-statement'] = concat.parser_combinators.desc_cumulatively(
        funcdef_statement_parser, 'funcdef statement'
    )

    decorator = parsers.token('AT') >> parsers.ref_parser('word')

    annotation_parser = (
        parsers.token('RARROW') >> parsers.ref_parser('word').many()
    )

    @parsy.generate
    def suite():
        words = parsers['word'].at_least(1)
        statement = parsy.seq(parsers['statement'])
        block_content = (
            parsers['word'] << parsers.token('NEWLINE').optional()
            | parsers['statement'] << parsers.token('NEWLINE')
        ).at_least(1)
        indented_block = (
            parsers.token('NEWLINE').optional()
            >> parsers.token('INDENT')
            >> block_content
            << parsers.token('DEDENT')
        )
        return (yield indented_block | statement | words)

    suite = concat.parser_combinators.desc_cumulatively(suite, 'suite')

    @parsy.generate('module')
    def module():
        name = parsers.token('NAME').map(operator.attrgetter('value'))
        return '.'.join((yield name.sep_by(parsers.token('DOT'), min=1)))

    # These following parsers parse import statements.
    # import statement = IMPORT, module, [ AS, NAME ]
    # | FROM, relative module, IMPORT, NAME, [ AS, NAME ]
    # | FROM, module, IMPORT, STAR;
    # module = NAME, (DOT, NAME)* ;
    # relative module = DOT*, module | DOT+ ;
    @parsy.generate('import statement')
    def import_statement_parser() -> Generator:
        location = (yield parsers.token('IMPORT')).start
        module_name = yield module
        asname_parser = parsers.token('NAME').map(operator.attrgetter('value'))
        asname = None
        if (yield parsers.token('AS').optional()):
            asname = yield asname_parser
        return ImportStatementNode(module_name, asname, location)

    parsers['import-statement'] = import_statement_parser

    @parsy.generate('relative module')
    def relative_module():
        dot = parsers.token('DOT').map(operator.attrgetter('value'))
        return (yield (dot.many().concat() + module) | dot.at_least(1))

    @parsy.generate('from-import statement')
    def from_import_statement_parser() -> Generator:
        location = (yield parsers.token('FROM')).start
        module = yield relative_module
        name_parser = parsers.token('NAME').map(operator.attrgetter('value'))
        imported_name = yield parsers.token('IMPORT') >> name_parser
        asname = None
        if (yield parsers.token('AS').optional()):
            asname = yield name_parser
        return FromImportStatementNode(module, imported_name, asname, location)

    parsers['import-statement'] |= from_import_statement_parser

    @parsy.generate('from-import-star statement')
    def from_import_star_statement_parser() -> Generator:
        location = (yield parsers.token('FROM')).start
        module_name = yield module
        yield parsers.token('IMPORT')
        yield parsers.token('STAR')
        return FromImportStarStatementNode(module_name, location)

    parsers['import-statement'] |= from_import_star_statement_parser

    # This parses a class definition statement.
    # classdef statement = CLASS, NAME, decorator*, [ bases ], keyword arg*,
    # COLON, suite ;
    # bases = tuple word ;
    # keyword arg = NAME, EQUAL, word ;
    @parsy.generate('classdef statement')
    def classdef_statement_parser():
        location = (yield parsers.token('CLASS')).start
        name_token = yield parsers.token('NAME')
        decorators = yield decorator.many()
        bases_list = yield bases.optional()
        keyword_args = yield keyword_arg.map(tuple).many()
        yield parsers.token('COLON')
        body = yield suite
        return ClassdefStatementNode(
            name_token.value,
            body,
            location,
            decorators,
            bases_list,
            keyword_args,
        )

    parsers['classdef-statement'] = classdef_statement_parser

    bases = parsers.ref_parser('tuple-word').map(
        operator.attrgetter('tuple_children')
    )

    keyword_arg = parsy.seq(
        parsers.token('NAME').map(operator.attrgetter('value'))
        << parsers.token('EQUAL'),
        parsers.ref_parser('word'),
    )

    parsers['word'] |= parsers.ref_parser('cast-word')

    @parsy.generate
    def cast_word_parser() -> Generator:
        location = (yield parsers.token('CAST')).start
        yield parsers.token('LPAR')
        type_ast = yield parsers['type']
        yield parsers.token('RPAR')
        return CastWordNode(type_ast, location)

    # This parses a cast word.
    # none word = LPAR, type, RPAR, CAST ;
    # The grammar of 'type' is defined by the typechecker.
    parsers['cast-word'] = concat.parser_combinators.desc_cumulatively(
        cast_word_parser, 'cast word'
    )
def parser():
    """Parse an indented block per the reference returned by `p_reference`.

    Returns:
        List[str]: the combined result `f(vals)` for the indented items.

    Raises:
        TypeError: if `p_reference` does not return one of
            IndentNone | IndentMany | IndentSome
    """
    yield p_space_consumer
    _, ref_level = yield parsy.line_info
    indent_opt = yield p_reference
    if isinstance(indent_opt, IndentNone):
        # Parse no indented tokens, just return the value.
        # Fix: consume trailing space and return the value itself —
        # `return p_space_consumer.result(...)` handed back a Parser
        # object as the generator's result.
        yield p_space_consumer
        return indent_opt.val
    elif isinstance(indent_opt, IndentMany):
        # Parse none-or-many indented tokens, use given indentation
        # level (if `None`, use level of the first indented token)
        maybe_indent, f, p = indent_opt
        p_indent_guard = indent_guard(p_space_consumer, operator.gt, ref_level)
        maybe_lvl = yield try_(char.eol >> p_indent_guard).optional()
        done = yield (parsy.eof.result(True)).optional()
        if not done and maybe_lvl is not None:
            next_level = from_maybe(maybe_lvl, maybe_indent)
            vals = yield _indented_items(
                ref_level, next_level, p_space_consumer, p
            )
            return f(vals)
        else:
            # Fix: same return-a-parser bug as the IndentNone branch.
            yield p_space_consumer
            return f([])
    elif isinstance(indent_opt, IndentSome):
        # Just like `IndentMany`, but requires at least one indented token
        # to be present
        maybe_indent, f, p = indent_opt
        p_indent_guard = indent_guard(p_space_consumer, operator.gt, ref_level)
        pos = yield char.eol >> p_indent_guard
        lvl = from_maybe(pos, maybe_indent)
        if pos <= ref_level:
            # Fix: the failing parser must be yielded; the bare
            # `parsy.fail(...)` call was a no-op and this branch silently
            # fell through returning None.
            yield parsy.fail(
                'indent_block: {pos} > {ref}'.format(
                    ref=ref_level,
                    pos=pos,
                )
            )
        elif pos == lvl:
            current_val = yield p
            more_vals = yield _indented_items(ref_level, lvl, p_space_consumer, p)
            return f([current_val] + more_vals)
        else:
            # Fix: same no-op bare `parsy.fail(...)` call as above.
            yield parsy.fail(
                'indent_block: {lvl} == {pos}'.format(
                    lvl=lvl,
                    pos=pos,
                )
            )
    else:
        raise TypeError('Must be one of IndentNone|IndentMany|IndentSome')
def binop():
    """Parse an identifier and accept it only if it names a binary operator."""
    name = yield ident
    if name in e.bin_ops:
        return name
    # Not a known binary operator: fail so alternatives can be tried.
    yield fail("{} is not a binary operator".format(name))
def python_identifier() -> parsy.Parser:
    """Parse a run of identifier characters and verify it with
    `str.isidentifier` (rejects e.g. digit-initial or keyword-shaped junk
    the character classes alone would admit)."""
    candidate = yield regex(f"[{ID_START}][{ID_CONTINUE}]*")
    if not candidate.isidentifier():
        yield parsy.fail("Not a valid python identifier")
    return candidate
def identifier_impl():
    """Parse a lexeme identifier, rejecting reserved words.

    Returns:
        str: the matched identifier text.
    """
    lex = yield lexeme(P.regex(r"[_a-zA-Z][_'a-zA-Z0-9]*"))
    if lex in reserved:
        # Fix: the failing parser must be yielded; `return P.fail(...)`
        # made the Parser object itself the (successful) parse result,
        # so reserved words were never actually rejected.
        yield P.fail("<not a reserved identifier>")
    return lex