def _CompareChain(self, children):
    # type: (List[PNode]) -> expr_t
    """Convert a chained comparison into a single expr.Compare node.

    Grammar:
        comparison: expr (comp_op expr)*

    children[0] is the leftmost operand.  After it, operator nodes and
    operand nodes alternate.  An operator node holds either one token
    (is, <, ==, ...) or two tokens ('not in', 'is not').
    """
    leftmost = self.Expr(children[0])
    operators = []  # type: List[speck]
    operands = []  # type: List[expr_t]

    # Visit each (comp_op, expr) pair that follows the first operand.
    for pos in range(1, len(children), 2):
        op_nodes = children[pos].children
        first = op_nodes[0].tok

        if len(op_nodes) != 2:
            # Single-token operator: is, <, ==, etc.
            op_id = first.id
        elif first.id == Id.Expr_Not:  # not in
            op_id = Id.Node_NotIn
        elif first.id == Id.Expr_Is:  # is not
            op_id = Id.Node_IsNot
        else:
            raise AssertionError()

        # Two-token operators are blamed on their first token.
        operators.append(speck(op_id, first.span_id))
        operands.append(self.Expr(children[pos + 1]))

    return expr.Compare(leftmost, operators, operands)
def _NameInClass(self, negated_tok, tok):
    # type: (Token, Token) -> class_literal_term_t
    """Resolve a bare name that appears inside a character class literal.

    Unlike a name in the regex proper, 'dot' has no special meaning here,
    and a one-character name such as `d` is a literal 'd', not `digit`.

    negated_tok is the negation token preceding the name, or a false value
    when the name is not negated; it is only used to build a speck for error
    messages.
    """
    # Location of the negation, kept for error messages.
    negated_speck = None
    if negated_tok:
        negated_speck = speck(negated_tok.id, negated_tok.span_id)

    name = tok.val

    # A bare, unquoted character literal.  In the grammar, this is expressed
    # as range_char without an ending.  d is NOT 'digit', it's a literal 'd'!
    if len(name) == 1:
        # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
        assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)

        # [~d] is not allowed, only [~digit]
        if negated_tok:
            p_die("Can't negate this symbol", token=tok)
        return class_literal_term.CharLiteral(tok)

    # Long names only: digit, word, but not d, w, etc.
    if name in POSIX_CLASSES:
        return posix_class(negated_speck, name)

    perl_name = PERL_CLASSES.get(name)
    if perl_name is not None:
        return perl_class(negated_speck, perl_name)

    p_die("%r isn't a character class", name, token=tok)
def _NameInRegex(self, negated_tok, tok):
    # type: (Token, Token) -> re_t
    """Resolve a bare name that appears directly in a regex.

    The name is tried, in order, as: the special name 'dot' (which cannot be
    negated and is returned as the token itself), a POSIX class, a Perl
    class, and finally a splice when it starts with an uppercase letter.
    Anything else is reported through p_die.

    negated_tok is the negation token preceding the name, or a false value
    when the name is not negated.
    """
    # Location of the negation, kept for error messages.
    negated_speck = None
    if negated_tok:
        negated_speck = speck(negated_tok.id, negated_tok.span_id)

    name = tok.val

    if name == 'dot':
        if negated_tok:
            p_die("Can't negate this symbol", token=tok)
        return tok

    if name in POSIX_CLASSES:
        return posix_class(negated_speck, name)

    perl_name = PERL_CLASSES.get(name)
    if perl_name is not None:
        return perl_class(negated_speck, perl_name)

    # Capitalized names are spliced, e.g. HexDigit
    if name[0].isupper():
        return re.Splice(tok)

    p_die("%r isn't a character class", name, token=tok)
def _NameInClass(self, negated_tok, tok):
    # type: (token, token) -> class_literal_term_t
    """Resolve a bare name that appears inside a character class literal.

    The name is looked up in self.POSIX_CLASSES and then in
    self.PERL_CLASSES.  'dot' gets no special treatment here.  A name found
    in neither table is reported through p_die.

    negated_tok is the negation token preceding the name, or a false value
    when the name is not negated.
    """
    # Location of the negation, kept for error messages.
    negated_speck = None
    if negated_tok:
        negated_speck = speck(negated_tok.id, negated_tok.span_id)

    name = tok.val

    if name in self.POSIX_CLASSES:
        return posix_class(negated_speck, name)

    perl_name = self.PERL_CLASSES.get(name)
    if perl_name:
        return perl_class(negated_speck, perl_name)

    p_die("%r isn't a character class", name, token=tok)
def _NameInRegex(self, negated_tok, tok):
    # type: (token, token) -> re_t
    """Resolve a bare name that appears directly in a regex.

    'dot' is returned as the token itself and cannot be negated.  Other
    names are looked up in self.POSIX_CLASSES and then in
    self.PERL_CLASSES.  A name found in neither table is reported through
    p_die.

    negated_tok is the negation token preceding the name, or a false value
    when the name is not negated.
    """
    # Location of the negation, kept for error messages.
    negated_speck = None
    if negated_tok:
        negated_speck = speck(negated_tok.id, negated_tok.span_id)

    name = tok.val

    if name == 'dot':
        if negated_tok:
            p_die("Can't negate this symbol", token=tok)
        return tok

    if name in self.POSIX_CLASSES:
        return posix_class(negated_speck, name)

    perl_name = self.PERL_CLASSES.get(name)
    if perl_name:
        return perl_class(negated_speck, perl_name)

    p_die("%r isn't a character class", name, token=tok)
def _ReAtom(self, p_atom):
    # type: (PNode) -> re_t
    """Transform a re_atom parse node into a regex AST node.

    re_atom: ( char_literal

    The first child decides the shape: a nonterminal child is dispatched on
    its grammar type, while a token child is dispatched on its token id.
    Shapes that are not handled raise NotImplementedError.
    """
    assert p_atom.typ == grammar_nt.re_atom, p_atom.typ

    children = p_atom.children
    first = children[0]
    typ = first.typ

    if ISNONTERMINAL(typ):
        if typ == grammar_nt.class_literal:
            return re.ClassLiteral(False, self._ClassLiteral(first))

        if typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, first.children[1].tok)

        if typ == grammar_nt.dq_string:
            return cast(double_quoted, first.children[1].tok)

        if typ == grammar_nt.sq_string:
            return cast(single_quoted, first.children[1].tok)

        if typ == grammar_nt.simple_var_sub:
            return simple_var_sub(first.tok)

        if typ == grammar_nt.char_literal:
            return first.tok

        raise NotImplementedError(typ)

    # Otherwise the first child is a token.
    tok = first.tok
    tok_id = tok.id

    # Special punctuation
    if tok_id in (Id.Expr_Dot, Id.Arith_Caret, Id.Expr_Dollar):
        return speck(tok_id, tok.span_id)

    # TODO: d digit can turn into PosixClass and PerlClass right here!
    # It's parsing.
    if tok_id == Id.Expr_Name:
        return self._NameInRegex(None, tok)

    if tok_id == Id.Expr_Symbol:
        # Validate symbols here, like we validate PerlClass, etc.
        if tok.val in ('%start', '%end', 'dot'):
            return tok
        p_die("Unexpected token %r in regex", tok.val, token=tok)

    if tok_id == Id.Expr_At:
        # | '@' Expr_Name
        return re.Splice(children[1].tok)

    if tok_id == Id.Arith_Tilde:
        # | '~' [Expr_Name | class_literal]
        negated = children[1]
        if ISNONTERMINAL(negated.typ):
            return re.ClassLiteral(True, self._ClassLiteral(negated))
        return self._NameInRegex(tok, negated.tok)

    if tok_id == Id.Op_LParen:
        # | '(' regex ')'
        #
        # Note: in ERE (d+) is the same as <d+>.  That is, Group becomes
        # Capture.
        return re.Group(self._Regex(children[1]))

    if tok_id == Id.Arith_Less:
        # | '<' regex [':' name_type] '>'
        captured = self._Regex(children[1])

        name_tok = None
        if len(children) == 5:
            # TODO: Add type expression
            # YES
            #   < d+ '.' d+ : ratio Float >
            #   < d+ : month Int >
            # INVALID
            #   < d+ : month List[int] >
            name_tok = children[3].children[0].tok

        return re.Capture(captured, name_tok)

    # Not implemented yet, including:
    #   | ':' '(' regex ')'      (Id.Arith_Colon)
    raise NotImplementedError(Id_str(tok_id))
def _ReAtom(self, p_atom):
    # type: (PNode) -> re_t
    """Transform a re_atom parse node into a regex AST node.

    re_atom: ( char_literal

    The first child decides the shape: a nonterminal child is dispatched on
    its grammar type, while a token child is dispatched on its token id.

    Raises:
        NotImplementedError: for a nonterminal type or token id that is not
            handled here (including the ':' '(' regex ')' form).
    """
    assert p_atom.typ == grammar_nt.re_atom, p_atom.typ

    children = p_atom.children
    typ = children[0].typ

    if ISNONTERMINAL(typ):
        p_child = children[0]

        if typ == grammar_nt.class_literal:
            return re.ClassLiteral(False, self._ClassLiteral(p_child))

        # For the next three, the payload is the token of the node's second
        # child.
        if typ == grammar_nt.braced_var_sub:
            return cast(braced_var_sub, p_child.children[1].tok)

        if typ == grammar_nt.dq_string:
            return cast(double_quoted, p_child.children[1].tok)

        if typ == grammar_nt.sq_string:
            return cast(single_quoted, p_child.children[1].tok)

        if typ == grammar_nt.simple_var_sub:
            return simple_var_sub(p_child.tok)

        if typ == grammar_nt.char_literal:
            return p_child.tok

        raise NotImplementedError(typ)

    tok = children[0].tok

    # Special punctuation
    if tok.id in (Id.Expr_Dot, Id.Arith_Caret, Id.Expr_Dollar):
        return speck(tok.id, tok.span_id)

    # TODO: d digit can turn into PosixClass and PerlClass right here!
    # It's parsing.
    if tok.id == Id.Expr_Name:
        return self._NameInRegex(None, tok)

    if tok.id == Id.Expr_Symbol:
        # Validate symbols here, like we validate PerlClass, etc.
        if tok.val in ('%start', '%end', 'dot'):
            return tok
        p_die("Unexpected token %r in regex", tok.val, token=tok)

    if tok.id == Id.Expr_At:
        # | '@' Expr_Name
        return re.Splice(children[1].tok)

    if tok.id == Id.Arith_Tilde:
        # | '~' [Expr_Name | class_literal]
        negated = children[1]
        if ISNONTERMINAL(negated.typ):
            return re.ClassLiteral(True, self._ClassLiteral(negated))
        return self._NameInRegex(tok, negated.tok)

    if tok.id == Id.Op_LParen:
        # | '(' regex ['as' name_type] ')'
        # TODO: Add variable
        return re.Group(self._Regex(children[1]))

    # Not implemented yet, including:
    #   | ':' '(' regex ')'      (Id.Arith_Colon)
    raise NotImplementedError(tok.id)
def _ReadBracedVarSub(self, left_token, d_quoted):
    # type: (Token, bool) -> braced_var_sub
    """For the ${} expression language.

    NAME        = [a-zA-Z_][a-zA-Z0-9_]*
    NUMBER      = [0-9]+                    # ${10}, ${11}, ...

    Subscript   = '[' ('@' | '*' | ArithExpr) ']'
    VarSymbol   = '!' | '@' | '#' | ...
    VarOf       = NAME Subscript?
                | NUMBER      # no subscript allowed, none of these are arrays
                              # ${@[1]} doesn't work, even though slicing does
                | VarSymbol

    TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
    STRIP_OP    = '#' | '##' | '%' | '%%'
    CASE_OP     = ',' | ',,' | '^' | '^^'

    UnaryOp     = TEST_OP | STRIP_OP | CASE_OP | ...
    Match       = ('/' | '#' | '%') WORD       # match all / prefix / suffix
    VarExpr     = VarOf
                | VarOf UnaryOp WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

    LengthExpr  = '#' VarOf    # can't apply operators after length

    RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                               # ${!ref[0]} vs ${!keys[@]} resolved later

    PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

    VarSub      = LengthExpr
                | RefOrKeys
                | PrefixQuery
                | VarExpr

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
      that it's also vectorized.

    Strictness over bash:
    echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
    grammar
    ! and # prefixes can't be composed, even though named refs can be
    composed with other operators
    '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
    a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, and the ! can.  However

    ${a[@]:1:2} is not allowed
    ${#a[@]:1:2} is allowed, but gives the wrong answer

    Args:
      left_token: the token that opened the substitution; its span id is
        recorded as the first entry appended to part.spids.
      d_quoted: selects the lexer mode passed to _ParseVarExpr for operator
        arguments (VSub_ArgDQ when true, VSub_ArgUnquoted otherwise).

    Returns:
      The parsed part, with prefix_op set when a '#' or '!' prefix was
      recognized, and with the left and right span ids appended to spids.
    """
    if d_quoted:
        arg_lex_mode = lex_mode_e.VSub_ArgDQ
    else:
        arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

    self._Next(lex_mode_e.VSub_1)
    self._Peek()

    ty = self.token_type
    # Span of the first token inside ${ }.  When that token turns out to be a
    # '#' or '!' prefix operator, this is the location it must be blamed on.
    # Reading self.cur_token AFTER parsing the operand would instead give the
    # span of whatever token the parser stopped at (e.g. the closing brace).
    first_spid = self.cur_token.span_id

    if ty == Id.VSub_Pound:
        # Disambiguate
        next_id = self.lexer.LookAhead(lex_mode_e.VSub_1)
        if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
            # e.g. a name, '#' is the prefix
            self._Next(lex_mode_e.VSub_1)
            part = self._ParseVarOf()

            self._Peek()
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } after length expression',
                      token=self.cur_token)

            part.prefix_op = speck(ty, first_spid)

        else:  # not a prefix, '#' is the variable
            part = self._ParseVarExpr(arg_lex_mode)

    elif ty == Id.VSub_Bang:
        next_id = self.lexer.LookAhead(lex_mode_e.VSub_1)
        if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
            # e.g. a name, '!' is the prefix
            # ${!a} -- this is a ref
            # ${!3} -- this is ref
            # ${!a[1]} -- this is a ref
            # ${!a[@]} -- this is a keys
            # No lookahead -- do it in a second step, or at runtime
            self._Next(lex_mode_e.VSub_1)
            part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

            part.prefix_op = speck(ty, first_spid)

        else:  # not a prefix, '!' is the variable
            part = self._ParseVarExpr(arg_lex_mode)

    # VS_NAME, VS_NUMBER, symbol that isn't # or !
    elif self.token_kind == Kind.VSub:
        part = self._ParseVarExpr(arg_lex_mode)

    else:
        # e.g. ${^}
        # NOTE(review): the code below relies on p_die not returning
        # (otherwise 'part' would be unbound) -- confirm against p_die.
        p_die('Unexpected token in ${}', token=self.cur_token)

    part.spids.append(left_token.span_id)

    # Does this work?
    right_spid = self.cur_token.span_id
    part.spids.append(right_spid)

    return part