def _ReadExtGlob(self):
  # type: () -> word_part__ExtGlob
  """Read an extended glob part like @(foo|bar) after its LEFT token.

  Grammar:
    Item         = word.Compound | EPSILON  # important: @(foo|) is allowed
    LEFT         = '@(' | '*(' | '+(' | '?(' | '!('
    RIGHT        = ')'
    ExtGlob      = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
    Compound includes ExtGlob
  """
  left_token = self.cur_token
  arms = []  # type: List[word_t]
  spids = []
  spids.append(left_token.span_id)

  # Tell the lexer to translate ) into the ext-glob terminator.
  self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
  self._Next(lex_mode_e.ExtGlob)  # advance past LEFT

  read_word = False  # did we just read a word?  To handle @(||).

  while True:
    self._Peek()

    if self.token_type == Id.Right_ExtGlob:
      if not read_word:
        # e.g. @(foo|) -- the trailing empty alternative still counts.
        arms.append(word.Compound())
      spids.append(self.cur_token.span_id)
      break

    elif self.token_type == Id.Op_Pipe:
      if not read_word:
        # Empty alternative between two pipes, e.g. @(||).
        arms.append(word.Compound())
      read_word = False
      self._Next(lex_mode_e.ExtGlob)

    # lex mode EXTGLOB should only produce these 4 kinds of tokens
    elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ExtGlob)
      arms.append(w)
      read_word = True

    elif self.token_kind == Kind.Eof:
      p_die('Unexpected EOF reading extended glob that began here',
            token=left_token)

    else:
      raise AssertionError('Unexpected token %r' % self.cur_token)

  part = word_part.ExtGlob(left_token, arms)
  part.spids.extend(spids)
  return part
def testVarOps(self):
  """Exercise _EvalWordPart on set/unset vars, then again with a suffix op."""
  ev = InitEvaluator()  # initializes x=xxx and y=yyy

  unset_sub = word_part.BracedVarSub(token(Id.VSub_Name, 'unset'))
  set_sub = word_part.BracedVarSub(token(Id.VSub_Name, 'x'))

  def _Eval(sub):
    # Evaluate one part and show what part values it produced.
    part_vals = []
    ev._EvalWordPart(sub, part_vals)
    print(part_vals)

  _Eval(unset_sub)
  _Eval(set_sub)

  # Now add some ops: give both subs a ${x:-default} style test op.
  part = word_part.Literal(token(Id.Lit_Chars, 'default'))
  arg_word = word.Compound([part])
  test_op = suffix_op.Unary(Id.VTest_ColonHyphen, arg_word)
  unset_sub.suffix_op = test_op
  set_sub.suffix_op = test_op

  _Eval(unset_sub)
  _Eval(set_sub)
def BraceExpandWords(words):
  # type: (List[word__Compound]) -> List[word__Compound]
  """Expand every BracedTree in a word list; pass other words through as-is."""
  expanded = []  # type: List[word__Compound]
  for w in words:
    if not isinstance(w, word__BracedTree):
      expanded.append(w)
      continue
    # Each expansion result is a list of parts forming one new word.
    for parts in _BraceExpand(w.parts):
      expanded.append(word.Compound(parts))
  return expanded
def ReadForPlugin(self):
  # type: () -> word__Compound
  """Read a word for $PS1, $PS4, etc.

  This is just like reading a here doc line.  "\n" is allowed, as well as the
  typical substitutions ${x} $(echo hi) $((1 + 2)).
  """
  result = word.Compound()
  self._ReadLikeDQ(None, result.parts)
  return result
def testBraceExpand(self):
  """_BraceDetect/_BraceExpand should produce the expected number of words.

  The detect/expand/assert sequence was copy-pasted three times; it's now
  table-driven so new cases are one line each.
  """
  # Case without braces: no detection needed, expansion is the identity.
  w = _assertReadWord(self, 'hi')
  results = braces._BraceExpand(w.parts)
  self.assertEqual(1, len(results))
  for parts in results:
    _PrettyPrint(word.Compound(parts))
  print('')

  # (source, expected tree part count, expected expansion count)
  CASES = [
      ('B-{a,b}-E', 3, 2),
      ('B-{a,={b,c,d}=,e}-E', 3, 5),
      ('B-{a,b}-{c,d}-E', 5, 4),
  ]
  for code_str, num_tree_parts, num_expanded in CASES:
    w = _assertReadWord(self, code_str)
    tree = braces._BraceDetect(w)
    self.assertEqual(num_tree_parts, len(tree.parts))
    _PrettyPrint(tree)

    results = braces._BraceExpand(tree.parts)
    self.assertEqual(num_expanded, len(results))
    for parts in results:
      _PrettyPrint(word.Compound(parts))
    print('')
def TildeDetect(w):
  # type: (word_t) -> Optional[word_t]
  """Detect tilde expansion in a word.

  It might begin with Literal that needs to be turned into a TildeSub.
  (It depends on whether the second token begins with slash).

  If so, return a new word.  Otherwise return None.

  NOTE:
  - The regex for Lit_TildeLike could be expanded.  Right now it's
    conservative, like Lit_Chars without the /.
  - It's possible to write this in a mutating style, since only the first
    token is changed.  But note that we CANNOT know this during lexing.
  """
  # BracedTree, Empty, etc. can't be tilde expanded.
  if not isinstance(w, word__Compound):
    return None

  assert w.parts, w

  first = w.parts[0]
  if _LiteralId(first) != Id.Lit_TildeLike:
    return None
  assert isinstance(first, word_part__Literal)  # narrow for MyPy

  if len(w.parts) == 1:  # can't be zero parts
    return word.Compound([word_part.TildeSub(first.token)])

  second = w.parts[1]
  # NOTE: We could inspect the raw tokens.
  if _LiteralId(second) == Id.Lit_Chars:
    assert isinstance(second, word_part__Literal)  # narrow for MyPy
    if second.token.val.startswith('/'):
      sub = word_part.TildeSub(first.token)  # type: word_part_t
      return word.Compound([sub] + w.parts[1:])

  # It could be something like '~foo:bar', which doesn't have a slash.
  return None
def DetectAssocPair(w):
  # type: (word__Compound) -> Optional[Tuple[word__Compound, word__Compound]]
  """Like DetectShAssignment, but for A=(['k']=v ['k2']=v)

  The key and the value are both strings.  So we just pick out word_part.
  Unlike a[k]=v, A=([k]=v) is NOT ambiguous, because the [k] syntax is only
  used for associative array literals, as opposed to indexed array literals.
  """
  parts = w.parts
  if _LiteralId(parts[0]) != Id.Lit_LBracket:
    return None

  for i in xrange(len(parts)):
    if _LiteralId(parts[i]) == Id.Lit_ArrayLhsClose:  # ]=
      # e.g. for [$x$y]=$a$b, the key is $x$y and the value is $a$b
      key = word.Compound(parts[1:i])
      value = word.Compound(parts[i + 1:])
      return key, value

  return None  # no ]= found, so this isn't a key-value pair
def testMultiLine(self):
  """ReadWord yields word, word, newline per line; blanks/comments skipped.

  The read/assert pair was copy-pasted seven times; it's now a loop over the
  expected word sequence.
  """
  w_parser = test_lib.InitWordParser("""\
ls foo

# Multiple newlines and comments should be ignored

ls bar
""")
  print('--MULTI')

  def _Lit(s):
    # One-word Compound made of a single Lit_Chars token.
    return word.Compound([word_part.Literal(token(Id.Lit_Chars, s))])

  expected = [
      _Lit('ls'),
      _Lit('foo'),
      word.Token(token(Id.Op_Newline, '\n')),
      _Lit('ls'),
      _Lit('bar'),
      word.Token(token(Id.Op_Newline, '\n')),
      word.Token(token(Id.Eof_Real, '')),
  ]
  for expected_word in expected:
    w = w_parser.ReadWord(lex_mode_e.ShCommand)
    test_lib.AssertAsdlEqual(self, expected_word, w)
def ErrorWord(fmt, err):
  # type: (str, _ErrorWithLocation) -> word__Compound
  """Build a literal Compound word holding a formatted error message."""
  msg = fmt % err.UserErrorString()
  literal = word_part.Literal(token(Id.Lit_Chars, msg, const.NO_INTEGER))
  return word.Compound([literal])
def _BraceDetect(w):
  # type: (word__Compound) -> Optional[word__BracedTree]
  """Return a new word if the input word looks like a brace expansion.

  e.g. {a,b} or {1..10..2} (TODO)
  Do we want to accept {01..02} ?  zsh does make some attempt to do this too.

  NOTE: This is an iterative algorithm that uses a stack.  The grammar-based
  approach didn't seem natural.

  It's not LL(1) because of 'part*'.  And not LL(k) even?  Maybe it be handled
  with an LR parser?  In any case the imperative algorithm with 'early return'
  for a couple cases is fairly simple.

  Grammar:
    # an alternative is a literal, possibly empty, or another brace_expr

    part = <any part except Literal>
    alt = part* | brace_expr

    # a brace_expr is group of at least 2 braced and comma-separated
    # alternatives, with optional prefix and suffix.
    brace_expr = part* '{' alt ',' alt (',' alt)* '}' part*
  """
  # Errors:
  # }a{    - stack depth dips below 0
  # {a,b}{ - Stack depth doesn't end at 0
  # {a}    - no comma, and also not an numeric range

  cur_parts = []  # type: List[word_part_t]
  stack = []  # type: List[_StackFrame]

  found = False  # did we see at least one { ... } group?

  for i, part in enumerate(w.parts):
    append = True  # by default, the part passes through to cur_parts
    if isinstance(part, word_part__Literal):
      id_ = part.token.id
      if id_ == Id.Lit_LBrace:
        # Save prefix parts.  Start new parts list.
        new_frame = _StackFrame(cur_parts)
        stack.append(new_frame)
        cur_parts = []
        append = False
        found = True  # assume found, but can early exit with None later

      elif id_ == Id.Lit_Comma:
        # Append a new alternative.
        # NOTE: Should we allow this:
        # ,{a,b}
        # or force this:
        # \,{a,b}
        # ?  We're forcing braces right now but not commas.
        if stack:
          stack[-1].saw_comma = True
          stack[-1].alt_part.words.append(word.Compound(cur_parts))
          cur_parts = []  # clear
          append = False

      elif id_ == Id.Lit_RBrace:
        if not stack:  # e.g. echo {a,b}{  -- unbalanced {
          return None  # do not expand ANYTHING because of invalid syntax

        # Detect {1..10} and {1..10..2}
        #log('stack[-1]: %s', stack[-1])
        #log('cur_parts: %s', cur_parts)

        range_part = None
        # only allow {1..3}, not {a,1..3}
        if not stack[-1].saw_comma and len(cur_parts) == 1:
          # It must be ONE part.  For example, -1..-100..-2 is initially
          # lexed as a single Lit_Chars token.
          part = cur_parts[0]
          if (isinstance(part, word_part__Literal) and
              part.token.id == Id.Lit_Chars):
            range_part = _RangePartDetect(part.token)
            if range_part:
              # Replace the whole group with the single range part.
              frame = stack.pop()
              cur_parts = frame.cur_parts
              cur_parts.append(range_part)
              append = False

        # It doesn't look like a range -- process it as the last element in
        # {a,b,c}
        if not range_part:
          if not stack[-1].saw_comma:  # {foo} is not a real alternative
            return None  # early return

          stack[-1].alt_part.words.append(word.Compound(cur_parts))

          frame = stack.pop()
          cur_parts = frame.cur_parts
          cur_parts.append(frame.alt_part)
          append = False

    if append:
      cur_parts.append(part)

  if len(stack) != 0:
    # e.g. {a,b}{ -- an unterminated group remains open.
    return None

  if found:
    return word.BracedTree(cur_parts)
  else:
    return None
def Expr(self, pnode):
  # type: (PNode) -> expr_t
  """Transform expressions (as opposed to statements).

  Dispatches on the parse node's grammar symbol; nonterminals recurse or
  delegate to helpers, terminals become leaf expr nodes.
  """
  typ = pnode.typ
  tok = pnode.tok
  children = pnode.children

  if ISNONTERMINAL(typ):
    c = '-' if not children else len(children)
    #log('non-terminal %s %s', nt_name, c)

    if typ == grammar_nt.oil_expr:  # for if/while
      # oil_expr: '(' testlist ')'
      return self.Expr(children[1])

    if typ == grammar_nt.return_expr:  # for if/while
      # return_expr: testlist end_stmt
      return self.Expr(children[0])

    if typ == grammar_nt.lvalue_list:
      return self._AssocBinary(children)

    if typ == grammar_nt.atom:
      return self.atom(children)

    if typ == grammar_nt.eval_input:
      # testlist_input: testlist NEWLINE* ENDMARKER
      return self.Expr(children[0])

    if typ == grammar_nt.testlist:
      # testlist: test (',' test)* [',']
      return self._AssocBinary(children)

    elif typ == grammar_nt.arith_expr:
      # expr: term (('+'|'-') term)*
      return self._AssocBinary(children)

    elif typ == grammar_nt.term:
      # term: factor (('*'|'/'|'div'|'mod') factor)*
      return self._AssocBinary(children)

    elif typ == grammar_nt.expr:
      # expr: xor_expr ('|' xor_expr)*
      return self._AssocBinary(children)

    elif typ == grammar_nt.shift_expr:
      # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
      return self._AssocBinary(children)

    elif typ == grammar_nt.comparison:
      # comparison: expr (comp_op expr)*
      return self._AssocBinary(children)

    elif typ == grammar_nt.factor:
      # factor: ('+'|'-'|'~') factor | power
      # the power would have already been reduced
      assert len(children) == 2, children
      op, e = children
      assert isinstance(op.tok, token)
      return expr.Unary(op.tok, self.Expr(e))

    elif typ == grammar_nt.atom_expr:
      # atom_expr: ['await'] atom trailer*

      # NOTE: This would be shorter in a recursive style.
      base = self.Expr(children[0])
      n = len(children)
      for i in xrange(1, n):
        pnode = children[i]
        tok = pnode.tok
        base = self.trailer(base, pnode)

      return base

    elif typ == grammar_nt.power:
      # power: atom_expr ['^' factor]

      # This doesn't repeat, so it doesn't matter if it's left or right
      # associative.
      return self._AssocBinary(children)

    elif typ == grammar_nt.array_literal:
      left_tok = children[0].tok

      # Approximation for now: only keep the Lit_Chars tokens between the
      # brackets.
      tokens = [
          pnode.tok for pnode in children[1:-1]
          if pnode.tok.id == Id.Lit_Chars
      ]
      items = [expr.Const(t) for t in tokens]  # type: List[expr_t]
      return expr.ArrayLiteral(left_tok, items)

    elif typ == grammar_nt.sh_array_literal:
      left_tok = children[0].tok

      # HACK: When typ is Id.Expr_WordsDummy, the 'tok' field ('opaque')
      # actually has a list of words!
      typ1 = children[1].typ
      assert typ1 == Id.Expr_WordsDummy.enum_id, typ1
      array_words = cast('List[word_t]', children[1].tok)

      return expr.ShellArrayLiteral(left_tok, array_words)

    elif typ == grammar_nt.regex_literal:
      left_tok = children[0].tok

      # Approximation for now: only variable references are recognized.
      tokens = [
          pnode.tok for pnode in children[1:-1]
          if pnode.tok.id == Id.Expr_Name
      ]
      parts = [regex.Var(t) for t in tokens]  # type: List[regex_t]

      return expr.RegexLiteral(left_tok, regex.Concat(parts))

    elif typ == grammar_nt.command_sub:
      left_tok = children[0].tok

      # Approximation for now: each Lit_Chars token becomes one word.
      tokens = [
          pnode.tok for pnode in children[1:-1]
          if pnode.tok.id == Id.Lit_Chars
      ]
      words = [
          word.Compound([word_part.Literal(t)]) for t in tokens
      ]  # type: List[word_t]
      return expr.CommandSub(left_tok, command.Simple(words))

    elif typ == grammar_nt.sh_command_sub:
      left_tok = children[0].tok

      # HACK: When typ is Id.Expr_CommandDummy, the 'tok' field ('opaque')
      # actually has a word_part.CommandSub!
      typ1 = children[1].typ
      assert typ1 == Id.Expr_CommandDummy.enum_id, typ1
      cs_part = cast(word_part__CommandSub, children[1].tok)

      # Awkward: the schemas are different
      expr_part = expr.CommandSub(cs_part.left_token, cs_part.command_list)
      expr_part.spids.extend(cs_part.spids)
      return expr_part

    elif typ == grammar_nt.var_sub:
      left_tok = children[0].tok
      return expr.VarSub(left_tok, self.Expr(children[1]))

    elif typ == grammar_nt.dq_string:
      left_tok = children[0].tok

      tokens = [
          pnode.tok for pnode in children[1:-1]
          if pnode.tok.id == Id.Lit_Chars
      ]
      parts2 = [word_part.Literal(t) for t in tokens]  # type: List[word_part_t]
      return expr.DoubleQuoted(left_tok, parts2)

    else:
      nt_name = self.number2symbol[typ]
      raise AssertionError(
          "PNode type %d (%s) wasn't handled" % (typ, nt_name))

  else:  # Terminals should have a token
    #log('terminal %s', tok)

    if tok.id == Id.Expr_Name:
      return expr.Var(tok)
    elif tok.id == Id.Expr_Digits:
      return expr.Const(tok)
    else:
      raise AssertionError(tok.id)
def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
                      lex_mode=lex_mode_e.ShCommand, empty_ok=True):
  # type: (Id_t, lex_mode_t, bool) -> word__Compound
  """Read a Compound word by accumulating parts until a terminator.

  Precondition: Looking at the first token of the first word part
  Postcondition: Looking at the token after, e.g. space or operator

  NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
  could be an operator delimiting a compound word.  Can we change lexer modes
  and remove this special case?
  """
  w = word.Compound()
  num_parts = 0
  brace_count = 0  # net { vs } seen, for the unbalanced-brace error below
  done = False
  while not done:
    self._Peek()

    allow_done = empty_ok or num_parts != 0
    if allow_done and self.token_type == eof_type:
      done = True  # e.g. for ${foo//pat/replace}

    # Keywords like "for" are treated like literals
    elif self.token_kind in (
        Kind.Lit, Kind.History, Kind.KW, Kind.ControlFlow,
        Kind.BoolUnary, Kind.BoolBinary):
      if self.token_type == Id.Lit_EscapedChar:
        part = word_part.EscapedLiteral(self.cur_token)  # type: word_part_t
      else:
        part = word_part.Literal(self.cur_token)

      if self.token_type == Id.Lit_VarLike and num_parts == 0:  # foo=
        w.parts.append(part)
        # Unfortunately it's awkward to pull the check for a=(1 2) up to
        # _ReadWord.
        t = self.lexer.LookAhead(lex_mode_e.ShCommand)
        if t.id == Id.Op_LParen:
          self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
          part2 = self._ReadArrayLiteral()
          w.parts.append(part2)

          # Array literal must be the last part of the word.
          self._Next(lex_mode)
          self._Peek()
          # EOF, whitespace, newline, Right_Subshell
          if self.token_kind not in self.KINDS_THAT_END_WORDS:
            p_die('Unexpected token after array literal',
                  token=self.cur_token)
          done = True

      elif (self.parse_opts.at and self.token_type == Id.Lit_Splice and
            num_parts == 0):
        splice_token = self.cur_token

        t = self.lexer.LookAhead(lex_mode_e.ShCommand)
        if t.id == Id.Op_LParen:  # @arrayfunc(x)
          arglist = arg_list()
          self._ParseCallArguments(arglist)
          part = word_part.FuncCall(splice_token, arglist)
        else:
          part = word_part.Splice(splice_token)

        w.parts.append(part)

        # @words or @arrayfunc() must be the last part of the word
        self._Next(lex_mode)
        self._Peek()
        # EOF, whitespace, newline, Right_Subshell
        if self.token_kind not in self.KINDS_THAT_END_WORDS:
          p_die('Unexpected token after array splice',
                token=self.cur_token)
        done = True

      else:
        # Syntax error for { and }
        if self.token_type == Id.Lit_LBrace:
          brace_count += 1
        elif self.token_type == Id.Lit_RBrace:
          brace_count -= 1

        # not a literal with lookahead; append it
        w.parts.append(part)

    elif self.token_kind == Kind.VSub:
      vsub_token = self.cur_token

      part = simple_var_sub(vsub_token)
      if self.token_type == Id.VSub_DollarName:
        # Look ahead for $strfunc(x)
        #   $f(x) or --name=$f(x) is allowed
        #   but "--name=$f(x)" not allowed?  This would BREAK EXISTING CODE.
        #   It would need a parse option.
        t = self.lexer.LookAhead(lex_mode_e.ShCommand)
        if t.id == Id.Op_LParen:
          arglist = arg_list()
          self._ParseCallArguments(arglist)
          part = word_part.FuncCall(vsub_token, arglist)

          # Unlike @arrayfunc(x), it makes sense to allow $f(1)$f(2)
          # var a = f(1); var b = f(2); echo $a$b
          # It's consistent with other uses of $.

      w.parts.append(part)

    elif self.token_kind == Kind.ExtGlob:
      part = self._ReadExtGlob()
      w.parts.append(part)

    elif self.token_kind == Kind.Left:
      part = self._ReadLeftParts()
      w.parts.append(part)
      # NOT done yet, will advance below

    elif self.token_kind == Kind.Right:
      # Still part of the word; will be done on the next iter.
      if self.token_type == Id.Right_DoubleQuote:
        pass
      # Never happens, no PushHint for this case.
      #elif self.token_type == Id.Right_DollarParen:
      #  pass
      elif self.token_type == Id.Right_Subshell:
        # LEXER HACK for (case x in x) ;; esac )
        assert self.next_lex_mode is None  # Rewind before it's used
        if self.lexer.MaybeUnreadOne():
          self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
          self._Next(lex_mode)
        done = True
      else:
        done = True

    elif self.token_kind == Kind.Ignored:
      done = True

    else:
      # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
      # so to test for ESAC, we can read ) before getting a chance to
      # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
      # token and do it again.

      # We get Id.Op_RParen at top level:      case x in x) ;; esac
      # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
      if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
        assert self.next_lex_mode is None  # Rewind before it's used
        if self.lexer.MaybeUnreadOne():
          if self.token_type == Id.Eof_RParen:
            # Redo translation
            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
          self._Next(lex_mode)

      done = True  # anything we don't recognize means we're done

    if not done:
      self._Next(lex_mode)
      num_parts += 1

  if self.parse_opts.brace and num_parts > 1 and brace_count != 0:
    # accept { and }, but not foo{
    p_die(
        'Word has unbalanced { }.  Maybe add a space or quote it like \{',
        word=w)

  return w