def _NonRangeChars(self, p_node):
    # type: (PNode) -> class_literal_term_t
    """Transform a range_char node used on its own (not as a range endpoint).

    Handles single characters in a char class, e.g. escaped chars, braced /
    simple var subs, quoted strings, and char literals.  Bare names fall
    through to Perl/POSIX class lookup.
    """
    assert p_node.typ == grammar_nt.range_char, p_node
    children = p_node.children
    typ = children[0].typ
    if ISNONTERMINAL(typ):
        p_child = children[0]
        # For the var-sub/string cases, the already-built AST node is stashed
        # in the 'tok' slot of the child's second PNode.
        if typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, p_child.children[1].tok)
        if typ == grammar_nt.dq_string:
            return cast(double_quoted, p_child.children[1].tok)
        if typ == grammar_nt.sq_string:
            return cast(single_quoted, p_child.children[1].tok)
        if typ == grammar_nt.simple_var_sub:
            return simple_var_sub(children[0].tok)
        if typ == grammar_nt.char_literal:
            return class_literal_term.CharLiteral(children[0].tok)
        raise NotImplementedError()
    else:
        # Look up PerlClass and PosixClass
        return self._NameInClass(None, children[0].tok)
def _ClassLiteralTerm(self, p_node):
    # type: (PNode) -> class_literal_term_t
    """Transform one term of a character-class literal.

    Grammar:
      class_literal_term: (
        range_char ['-' range_char ]
      | '~' Expr_Name
        # $mychars or ${mymodule.mychars}
      | simple_var_sub | braced_var_sub
        # e.g. 'abc' or "abc$mychars"
      | dq_string
        ...
    """
    assert p_node.typ == grammar_nt.class_literal_term, p_node
    children = p_node.children
    typ = children[0].typ
    if ISNONTERMINAL(typ):
        p_child = children[0]
        if typ == grammar_nt.simple_var_sub:
            return simple_var_sub(p_child.children[0].tok)
        if typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, p_child.children[1].tok)
        if typ == grammar_nt.dq_string:
            return cast(double_quoted, p_child.children[1].tok)
        n = len(children)
        if n == 1 and typ == grammar_nt.range_char:
            # A single char / string / var-sub, no '-' range.
            return self._NonRangeChars(children[0])
        # 'a'-'z' etc.
        if n == 3 and children[1].tok.id == Id.Arith_Minus:
            start = self._RangeChar(children[0])
            end = self._RangeChar(children[2])
            return class_literal_term.Range(start, end)
    else:
        # Terminal first child: only '~' negation is valid here.
        if children[0].tok.id == Id.Arith_Tilde:
            return self._NameInClass(children[0].tok, children[1].tok)
        raise AssertionError(children[0].tok.id)
    # Nonterminal that matched none of the cases above.
    nt_name = self.number2symbol[typ]
    raise NotImplementedError(nt_name)
def _ReadLikeDQ(self, left_dq_token, out_parts):
    # type: (Optional[Token], List[word_part_t]) -> None
    """Read the interior of a double-quoted string (or a here doc body).

    Args:
      left_dq_token: A token if we are reading a double quoted part, or None
        if we're reading a here doc.
      out_parts: list of word_part to append to

    Loops until the closing quote (DQ case) or EOF (here-doc case).
    """
    done = False
    while not done:
        self._Next(lex_mode_e.DQ)
        self._Peek()
        if self.token_kind == Kind.Lit:
            if self.token_type == Id.Lit_EscapedChar:
                part = word_part.EscapedLiteral(self.cur_token)  # type: word_part_t
            else:
                part = self.cur_token
            out_parts.append(part)
        elif self.token_kind == Kind.Left:
            # e.g. $(, ${, $((, backtick
            part = self._ReadDoubleQuotedLeftParts()
            out_parts.append(part)
        elif self.token_kind == Kind.VSub:
            part = simple_var_sub(self.cur_token)
            out_parts.append(part)
            # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode for it
            # later (original comment said "a more"; presumably "a mode").
        elif self.token_kind == Kind.Right:
            assert self.token_type == Id.Right_DoubleQuote, self.token_type
            if left_dq_token:
                done = True
            else:
                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)
        elif self.token_kind == Kind.Eof:
            if left_dq_token:
                p_die('Unexpected EOF reading double-quoted string that began here',
                      token=left_dq_token)
            else:  # here docs will have an EOF in their token stream
                done = True
        else:
            raise AssertionError(self.cur_token)
def Expr(self, pnode):
    # type: (PNode) -> expr_t
    """Transform expressions (as opposed to statements).

    Recursively walks a pgen2 parse tree (PNode) and builds the expr_t AST.
    Nonterminals dispatch on grammar_nt; terminals dispatch on token id.
    """
    typ = pnode.typ
    tok = pnode.tok
    children = pnode.children
    if ISNONTERMINAL(typ):
        #
        # Oil Entry Points / Additions
        #
        if typ == grammar_nt.oil_expr:  # for if/while
            # oil_expr: '(' testlist ')'
            return self.Expr(children[1])
        if typ == grammar_nt.command_expr:
            # command_expr: testlist end_stmt
            return self.Expr(children[0])
        #
        # Python-like Expressions / Operators
        #
        if typ == grammar_nt.atom:
            if len(children) == 1:
                return self.Expr(children[0])
            return self._Atom(children)
        if typ == grammar_nt.testlist:
            # testlist: test (',' test)* [',']
            return self._Tuple(children)
        if typ == grammar_nt.test:
            # test: or_test ['if' or_test 'else' test] | lambdef
            if len(children) == 1:
                return self.Expr(children[0])
            # TODO: Handle lambdef
            test = self.Expr(children[2])
            body = self.Expr(children[0])
            orelse = self.Expr(children[4])
            return expr.IfExp(test, body, orelse)
        if typ == grammar_nt.lambdef:
            # lambdef: '|' [name_type_list] '|' test
            n = len(children)
            if n == 4:
                params = self._NameTypeList(children[1])
            else:
                params = []
            body = self.Expr(children[n-1])
            return expr.Lambda(params, body)
        #
        # Operators with Precedence
        #
        if typ == grammar_nt.or_test:
            # or_test: and_test ('or' and_test)*
            return self._AssocBinary(children)
        if typ == grammar_nt.and_test:
            # and_test: not_test ('and' not_test)*
            return self._AssocBinary(children)
        if typ == grammar_nt.not_test:
            # not_test: 'not' not_test | comparison
            if len(children) == 1:
                return self.Expr(children[0])
            op_tok = children[0].tok  # not
            return expr.Unary(op_tok, self.Expr(children[1]))
        elif typ == grammar_nt.comparison:
            if len(children) == 1:
                return self.Expr(children[0])
            return self._CompareChain(children)
        elif typ == grammar_nt.range_expr:
            n = len(children)
            if n == 1:
                return self.Expr(children[0])
            if n == 3:
                return expr.Range(
                    self.Expr(children[0]),
                    self.Expr(children[2])
                )
            raise AssertionError(n)
        elif typ == grammar_nt.expr:
            # expr: xor_expr ('|' xor_expr)*
            return self._AssocBinary(children)
        if typ == grammar_nt.xor_expr:
            # xor_expr: and_expr ('xor' and_expr)*
            return self._AssocBinary(children)
        if typ == grammar_nt.and_expr:  # a & b
            # and_expr: shift_expr ('&' shift_expr)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.shift_expr:
            # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.arith_expr:
            # arith_expr: term (('+'|'-') term)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.term:
            # term: factor (('*'|'/'|'div'|'mod') factor)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.factor:
            # factor: ('+'|'-'|'~') factor | power
            # the power would have already been reduced
            if len(children) == 1:
                return self.Expr(children[0])
            assert len(children) == 2
            op = children[0]
            e = children[1]
            assert isinstance(op.tok, Token)
            return expr.Unary(op.tok, self.Expr(e))
        elif typ == grammar_nt.power:
            # power: atom trailer* ['^' factor]
            node = self.Expr(children[0])
            if len(children) == 1:  # No trailers
                return node
            n = len(children)
            i = 1
            # Apply each trailer (call, subscript, attribute) left-to-right.
            while i < n and ISNONTERMINAL(children[i].typ):
                node = self._Trailer(node, children[i])
                i += 1
            if i != n:  # ['^' factor]
                op_tok = children[i].tok
                assert op_tok.id == Id.Arith_Caret, op_tok
                factor = self.Expr(children[i+1])
                node = expr.Binary(op_tok, node, factor)
            return node
        elif typ == grammar_nt.array_literal:
            left_tok = children[0].tok
            # Skip the surrounding delimiters with [1:-1].
            items = [self._ArrayItem(p) for p in children[1:-1]]
            return expr.ArrayLiteral(left_tok, items)
        elif typ == grammar_nt.oil_expr_sub:
            return self.Expr(children[0])
        #
        # Oil Lexer Modes
        #
        # In these cases the already-built AST node was stashed in the 'tok'
        # slot of the second child; cast() recovers its static type.
        elif typ == grammar_nt.sh_array_literal:
            return cast(sh_array_literal, children[1].tok)
        elif typ == grammar_nt.sh_command_sub:
            return cast(command_sub, children[1].tok)
        elif typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, children[1].tok)
        elif typ == grammar_nt.dq_string:
            return cast(double_quoted, children[1].tok)
        elif typ == grammar_nt.sq_string:
            return cast(single_quoted, children[1].tok)
        elif typ == grammar_nt.simple_var_sub:
            return simple_var_sub(children[0].tok)
        else:
            nt_name = self.number2symbol[typ]
            raise AssertionError(
                "PNode type %d (%s) wasn't handled" % (typ, nt_name))
    else:  # Terminals should have a token
        id_ = tok.id
        if id_ == Id.Expr_Name:
            return expr.Var(tok)
        if id_ in (
            Id.Expr_DecInt, Id.Expr_BinInt, Id.Expr_OctInt, Id.Expr_HexInt,
            Id.Expr_Float):
            return expr.Const(tok)
        if id_ in (Id.Expr_Null, Id.Expr_True, Id.Expr_False):
            return expr.Const(tok)
        raise NotImplementedError(Id_str(id_))
def _ReAtom(self, p_atom):
    # type: (PNode) -> re_t
    """Transform one atom of an Oil regex (Eggex) into a re_t.

    Grammar (partial):
      re_atom: ( char_literal | ... )
    """
    assert p_atom.typ == grammar_nt.re_atom, p_atom.typ
    children = p_atom.children
    typ = children[0].typ
    if ISNONTERMINAL(typ):
        p_child = p_atom.children[0]
        if typ == grammar_nt.class_literal:
            return re.ClassLiteral(False, self._ClassLiteral(p_child))
        # Var-sub/string cases: AST node stashed in the child's 'tok' slot.
        if typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, p_child.children[1].tok)
        if typ == grammar_nt.dq_string:
            return cast(double_quoted, p_child.children[1].tok)
        if typ == grammar_nt.sq_string:
            return cast(single_quoted, p_child.children[1].tok)
        if typ == grammar_nt.simple_var_sub:
            return simple_var_sub(children[0].tok)
        if typ == grammar_nt.char_literal:
            return children[0].tok
        raise NotImplementedError(typ)
    else:
        tok = children[0].tok
        # Special punctuation
        if tok.id in (Id.Expr_Dot, Id.Arith_Caret, Id.Expr_Dollar):
            return speck(tok.id, tok.span_id)
        # TODO: d digit can turn into PosixClass and PerlClass right here!
        # It's parsing.
        if tok.id == Id.Expr_Name:
            return self._NameInRegex(None, tok)
        if tok.id == Id.Expr_Symbol:
            # Validate symbols here, like we validate PerlClass, etc.
            if tok.val in ('%start', '%end', 'dot'):
                return tok
            p_die("Unexpected token %r in regex", tok.val, token=tok)
        if tok.id == Id.Expr_At:
            # | '@' Expr_Name
            return re.Splice(children[1].tok)
        if tok.id == Id.Arith_Tilde:
            # | '~' [Expr_Name | class_literal]
            typ = children[1].typ
            if ISNONTERMINAL(typ):
                return re.ClassLiteral(True, self._ClassLiteral(children[1]))
            else:
                return self._NameInRegex(tok, children[1].tok)
        if tok.id == Id.Op_LParen:
            # | '(' regex ')'
            # Note: in ERE (d+) is the same as <d+>.  That is, Group becomes
            # Capture.
            return re.Group(self._Regex(children[1]))
        if tok.id == Id.Arith_Less:
            # | '<' regex [':' name_type] '>'
            regex = self._Regex(children[1])
            n = len(children)
            if n == 5:
                # TODO: Add type expression
                # YES
                #   < d+ '.' d+ : ratio Float >
                #   < d+ : month Int >
                # INVALID
                #   < d+ : month List[int] >
                name_tok = children[3].children[0].tok
            else:
                name_tok = None
            return re.Capture(regex, name_tok)
        if tok.id == Id.Arith_Colon:
            # | ':' '(' regex ')'
            raise NotImplementedError(Id_str(tok.id))
        raise NotImplementedError(Id_str(tok.id))
def _ReAtom(self, p_atom):
    # type: (PNode) -> re_t
    """Transform one atom of an Oil regex (Eggex) into a re_t.

    Grammar (partial):
      re_atom: ( char_literal | ... )

    Fix: removed the unused local 'ch = children[1].children' in the
    Arith_Tilde branch; it was assigned and never read.
    """
    assert p_atom.typ == grammar_nt.re_atom, p_atom.typ
    children = p_atom.children
    typ = children[0].typ
    if ISNONTERMINAL(typ):
        p_child = p_atom.children[0]
        if typ == grammar_nt.class_literal:
            return re.ClassLiteral(False, self._ClassLiteral(p_child))
        # Var-sub/string cases: AST node stashed in the child's 'tok' slot.
        if typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, p_child.children[1].tok)
        if typ == grammar_nt.dq_string:
            return cast(double_quoted, p_child.children[1].tok)
        if typ == grammar_nt.sq_string:
            return cast(single_quoted, p_child.children[1].tok)
        if typ == grammar_nt.simple_var_sub:
            return simple_var_sub(children[0].tok)
        if typ == grammar_nt.char_literal:
            return children[0].tok
        raise NotImplementedError(typ)
    else:
        tok = children[0].tok
        # Special punctuation
        if tok.id in (Id.Expr_Dot, Id.Arith_Caret, Id.Expr_Dollar):
            return speck(tok.id, tok.span_id)
        # TODO: d digit can turn into PosixClass and PerlClass right here!
        # It's parsing.
        if tok.id == Id.Expr_Name:
            return self._NameInRegex(None, tok)
        if tok.id == Id.Expr_Symbol:
            # Validate symbols here, like we validate PerlClass, etc.
            if tok.val in ('%start', '%end', 'dot'):
                return tok
            p_die("Unexpected token %r in regex", tok.val, token=tok)
        if tok.id == Id.Expr_At:
            # | '@' Expr_Name
            return re.Splice(children[1].tok)
        if tok.id == Id.Arith_Tilde:
            # | '~' [Expr_Name | class_literal]
            typ = children[1].typ
            if ISNONTERMINAL(typ):
                return re.ClassLiteral(True, self._ClassLiteral(children[1]))
            else:
                return self._NameInRegex(tok, children[1].tok)
        if tok.id == Id.Op_LParen:
            # | '(' regex ['as' name_type] ')'
            # TODO: Add variable
            return re.Group(self._Regex(children[1]))
        if tok.id == Id.Arith_Colon:
            # | ':' '(' regex ')'
            raise NotImplementedError(tok.id)
        raise NotImplementedError(tok.id)
def Expr(self, pnode):
    # type: (PNode) -> expr_t
    """Transform expressions (as opposed to statements).

    Recursively walks a pgen2 parse tree (PNode) and builds the expr_t AST.
    Nonterminals dispatch on grammar_nt; terminals dispatch on token id.
    """
    typ = pnode.typ
    tok = pnode.tok
    children = pnode.children
    if ISNONTERMINAL(typ):
        #
        # Oil Entry Points / Additions
        #
        if typ == grammar_nt.oil_expr:  # for if/while
            # oil_expr: '(' testlist ')'
            return self.Expr(children[1])
        if typ == grammar_nt.return_expr:
            # return_expr: testlist end_stmt
            return self.Expr(children[0])
        if typ == grammar_nt.place_list:
            return self._AssocBinary(children)
        if typ == grammar_nt.place:
            # place: NAME place_trailer*
            if len(pnode.children) == 1:
                return self.Expr(pnode.children[0])
            # TODO: Called _Trailer but don't handle ( )?
            #       only [] . -> :: ?
            raise NotImplementedError
        #
        # Python-like Expressions / Operators
        #
        if typ == grammar_nt.atom:
            if len(children) == 1:
                return self.Expr(children[0])
            return self._Atom(children)
        if typ == grammar_nt.testlist:
            # testlist: test (',' test)* [',']
            # We need tuples for Python's 'var a, b = x' and 'for (a, b in x) {'
            return self._Tuple(children)
        if typ == grammar_nt.test:
            # test: or_test ['if' or_test 'else' test] | lambdef
            if len(children) == 1:
                return self.Expr(children[0])
            # TODO: Handle lambdef
            test = self.Expr(children[2])
            body = self.Expr(children[0])
            orelse = self.Expr(children[4])
            return expr.IfExp(test, body, orelse)
        if typ == grammar_nt.test_nocond:
            # test_nocond: or_test | lambdef_nocond
            assert len(children) == 1
            return self.Expr(children[0])
        if typ == grammar_nt.argument:
            # argument: ( test [comp_for] |
            #             test '=' test |
            #             '**' test |
            #             '*' test )
            if len(pnode.children) == 1:
                return self.Expr(children[0])
            # TODO:
            raise NotImplementedError
        if typ == grammar_nt.subscript:
            # subscript: test | [test] ':' [test] [sliceop]
            if len(pnode.children) == 1:
                return self.Expr(children[0])
            # TODO:
            raise NotImplementedError
        if typ == grammar_nt.testlist_comp:
            # testlist_comp: (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
            if children[1].typ == grammar_nt.comp_for:
                elt = self.Expr(children[0])
                comp = self._CompFor(children[1])
                return expr.ListComp(elt, [comp])
            # (1,) (1, 2) etc.
            if children[1].tok.id == Id.Arith_Comma:
                return self._Tuple(children)
            raise NotImplementedError('testlist_comp')
        elif typ == grammar_nt.exprlist:
            # exprlist: (expr|star_expr) (',' (expr|star_expr))* [',']
            if len(children) == 1:
                return self.Expr(children[0])
            # used in for loop, genexpr.
            # TODO: This sould be placelist?  for x, *y ?
            raise NotImplementedError('exprlist')
        #
        # Operators with Precedence
        #
        if typ == grammar_nt.or_test:
            # or_test: and_test ('or' and_test)*
            return self._AssocBinary(children)
        if typ == grammar_nt.and_test:
            # and_test: not_test ('and' not_test)*
            return self._AssocBinary(children)
        if typ == grammar_nt.not_test:
            # not_test: 'not' not_test | comparison
            if len(children) == 1:
                return self.Expr(children[0])
            op_tok = children[0].tok  # not
            return expr.Unary(op_tok, self.Expr(children[1]))
        elif typ == grammar_nt.comparison:
            if len(children) == 1:
                return self.Expr(children[0])
            return self._CompareChain(children)
        elif typ == grammar_nt.expr:
            # expr: xor_expr ('|' xor_expr)*
            return self._AssocBinary(children)
        if typ == grammar_nt.xor_expr:
            # xor_expr: and_expr ('xor' and_expr)*
            return self._AssocBinary(children)
        if typ == grammar_nt.and_expr:  # a & b
            # and_expr: shift_expr ('&' shift_expr)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.shift_expr:
            # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.arith_expr:
            # arith_expr: term (('+'|'-') term)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.term:
            # term: factor (('*'|'/'|'div'|'mod') factor)*
            return self._AssocBinary(children)
        elif typ == grammar_nt.factor:
            # factor: ('+'|'-'|'~') factor | power
            # the power would have already been reduced
            if len(children) == 1:
                return self.Expr(children[0])
            op, e = children
            # NOTE(review): lowercase 'token' here vs 'Token' elsewhere —
            # presumably the ASDL-generated token type in this file's version;
            # confirm it is in scope.
            assert isinstance(op.tok, token)
            return expr.Unary(op.tok, self.Expr(e))
        elif typ == grammar_nt.power:
            # power: atom trailer* ['^' factor]
            node = self.Expr(children[0])
            if len(children) == 1:  # No trailers
                return node
            n = len(children)
            i = 1
            # Apply each trailer (call, subscript, attribute) left-to-right.
            while i < n and ISNONTERMINAL(children[i].typ):
                node = self._Trailer(node, children[i])
                i += 1
            if i != n:  # ['^' factor]
                op_tok = children[i].tok
                assert op_tok.id == Id.Arith_Caret, op_tok
                factor = self.Expr(children[i+1])
                node = expr.Binary(op_tok, node, factor)
            return node
        #
        # Oil Lexer Modes
        #
        elif typ == grammar_nt.array_literal:
            left_tok = children[0].tok
            # Approximation for now.
            tokens = [
                pnode.tok for pnode in children[1:-1] if pnode.tok.id == Id.Lit_Chars
            ]
            items = [expr.Const(t) for t in tokens]  # type: List[expr_t]
            return expr.ArrayLiteral(left_tok, items)
        elif typ == grammar_nt.sh_array_literal:
            left_tok = children[0].tok
            # HACK: When typ is Id.Expr_CastedDummy, the 'tok' field ('opaque')
            # actually has a list of words!
            typ1 = children[1].typ
            assert typ1 == Id.Expr_CastedDummy.enum_id, typ1
            array_words = cast('List[word_t]', children[1].tok)
            return sh_array_literal(left_tok, array_words)
        # The following cases recover an AST node stashed in the 'tok' slot.
        elif typ == grammar_nt.sh_command_sub:
            return cast(command_sub, children[1].tok)
        elif typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, children[1].tok)
        elif typ == grammar_nt.dq_string:
            return cast(double_quoted, children[1].tok)
        elif typ == grammar_nt.sq_string:
            return cast(single_quoted, children[1].tok)
        elif typ == grammar_nt.simple_var_sub:
            return simple_var_sub(children[0].tok)
        else:
            nt_name = self.number2symbol[typ]
            raise AssertionError(
                "PNode type %d (%s) wasn't handled" % (typ, nt_name))
    else:  # Terminals should have a token
        id_ = tok.id
        if id_ == Id.Expr_Name:
            return expr.Var(tok)
        if id_ in (
            Id.Expr_DecInt, Id.Expr_BinInt, Id.Expr_OctInt, Id.Expr_HexInt,
            Id.Expr_Float):
            return expr.Const(tok)
        if id_ in (Id.Expr_Null, Id.Expr_True, Id.Expr_False):
            return expr.Const(tok)
        from core.meta import IdInstance
        raise NotImplementedError(IdInstance(typ))
def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
    # type: (lex_mode_t, Id_t, bool) -> compound_word
    """Read a compound word (a sequence of word parts) from the lexer.

    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    Args:
      lex_mode: lexer mode to read subsequent tokens in
      eof_type: token id that terminates the word (see NOTE below)
      empty_ok: if True, a zero-part word may end immediately at eof_type

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
    could be an operator delimiting a compound word.  Can we change lexer
    modes and remove this special case?
    """
    w = compound_word()
    num_parts = 0
    brace_count = 0  # tracks balance of literal { } for parse_brace check below
    done = False
    while not done:
        self._Peek()
        # Only allow ending at eof_type once we have a part (unless empty_ok).
        allow_done = empty_ok or num_parts != 0
        if allow_done and self.token_type == eof_type:
            done = True  # e.g. for ${foo//pat/replace}
        # Keywords like "for" are treated like literals
        elif self.token_kind in (
            Kind.Lit, Kind.History, Kind.KW, Kind.ControlFlow,
            Kind.BoolUnary, Kind.BoolBinary):
            if self.token_type == Id.Lit_EscapedChar:
                part = word_part.EscapedLiteral(self.cur_token)  # type: word_part_t
            else:
                part = self.cur_token
            if self.token_type == Id.Lit_VarLike and num_parts == 0:  # foo=
                w.parts.append(part)
                # Unfortunately it's awkward to pull the check for a=(1 2) up to
                # _ReadWord.
                next_id = self.lexer.LookAhead(lex_mode)
                if next_id == Id.Op_LParen:  # a=(...)
                    self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
                    part2 = self._ReadArrayLiteral()
                    w.parts.append(part2)
                    # Array literal must be the last part of the word.
                    self._Next(lex_mode)
                    self._Peek()
                    # EOF, whitespace, newline, Right_Subshell
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after array literal',
                              token=self.cur_token)
                    done = True
            elif (self.parse_opts.parse_at() and
                  self.token_type == Id.Lit_Splice and num_parts == 0):
                splice_token = self.cur_token
                next_id = self.lexer.LookAhead(lex_mode)
                if next_id == Id.Op_LParen:  # @arrayfunc(x)
                    arglist = arg_list()
                    self._ParseCallArguments(arglist)
                    part = word_part.FuncCall(splice_token, arglist)
                else:
                    part = word_part.Splice(splice_token)
                w.parts.append(part)
                # @words or @arrayfunc() must be the last part of the word
                self._Next(lex_mode)
                self._Peek()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array splice',
                          token=self.cur_token)
                done = True
            else:
                # Syntax error for { and }
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                # not a literal with lookahead; append it
                w.parts.append(part)
        elif self.token_kind == Kind.VSub:
            vsub_token = self.cur_token
            part = simple_var_sub(vsub_token)
            if self.token_type == Id.VSub_DollarName:
                # Look ahead for $strfunc(x)
                #   $f(x) or --name=$f(x) is allowed
                #   but "--name=$f(x)" not allowed?  This would BREAK EXISTING
                #   CODE.  It would need a parse option.
                next_id = self.lexer.LookAhead(lex_mode)
                if next_id == Id.Op_LParen:
                    arglist = arg_list()
                    self._ParseCallArguments(arglist)
                    part = word_part.FuncCall(vsub_token, arglist)
                # Unlike @arrayfunc(x), it makes sense to allow $f(1)$f(2)
                #   var a = f(1); var b = f(2); echo $a$b
                # It's consistent with other uses of $.
            w.parts.append(part)
        elif self.token_kind == Kind.ExtGlob:
            part = self._ReadExtGlob()
            w.parts.append(part)
        elif self.token_kind == Kind.Left:
            part = self._ReadLeftParts()
            w.parts.append(part)
        # NOT done yet, will advance below
        elif self.token_kind == Kind.Right:
            # Still part of the word; will be done on the next iter.
            if self.token_type == Id.Right_DoubleQuote:
                pass
            # Never happens, no PushHint for this case.
            #elif self.token_type == Id.Right_DollarParen:
            #  pass
            elif self.token_type == Id.Right_Subshell:
                # LEXER HACK for (case x in x) ;; esac )
                # Rewind before it's used
                assert self.next_lex_mode == lex_mode_e.Undefined
                if self.lexer.MaybeUnreadOne():
                    self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                    self._Next(lex_mode)
                done = True
            else:
                done = True
        elif self.token_kind == Kind.Ignored:
            done = True
        else:
            # LEXER HACK for unbalanced case clause.  'case foo in esac' is
            # valid, so to test for ESAC, we can read ) before getting a chance
            # to PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread
            # one token and do it again.
            # We get Id.Op_RParen at top level:      case x in x) ;; esac
            # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
            if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                # Rewind before it's used
                assert self.next_lex_mode == lex_mode_e.Undefined
                if self.lexer.MaybeUnreadOne():
                    if self.token_type == Id.Eof_RParen:
                        # Redo translation
                        self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                    self._Next(lex_mode)
            done = True  # anything we don't recognize means we're done
        if not done:
            self._Next(lex_mode)
            num_parts += 1
    if self.parse_opts.parse_brace() and num_parts > 1 and brace_count != 0:
        # accept { and }, but not foo{
        p_die(
            'Word has unbalanced { }. Maybe add a space or quote it like \{',
            word=w)
    return w