def BoolId(w):
    # type: (word_t) -> Id_t
    """Return the Id that identifies a word inside a boolean expression.

    String and Token words report their own id.  A Compound word made of
    exactly one literal part reports that literal's Id when it is `!`, `]]`,
    or a BoolUnary / BoolBinary operator; any other Compound word is
    Id.Word_Compound.

    Raises:
      AssertionError: for any other word variant.
    """
    UP_w = w
    with tagswitch(w) as case:
        if case(word_e.String):  # for test/[
            str_word = cast(word__String, UP_w)
            return str_word.id

        elif case(word_e.Token):
            return cast(Token, UP_w).id

        elif case(word_e.Compound):
            parts = cast(compound_word, UP_w).parts
            if len(parts) != 1:
                return Id.Word_Compound

            lit_id = _LiteralId(parts[0])
            if lit_id == Id.Undefined_Tok:
                return Id.Word_Compound  # It's a regular word

            # These two live outside the BoolUnary/BoolBinary namespace, but
            # are handled the same way: special boolean "tokens".
            if lit_id in (Id.KW_Bang, Id.Lit_DRightBracket):
                return lit_id

            # Boolean operators
            if consts.GetKind(lit_id) in (Kind.BoolUnary, Kind.BoolBinary):
                return lit_id

            return Id.Word_Compound

        else:
            # I think Empty never happens in this context?
            raise AssertionError(w.tag_())
def CommandId(w):
    # type: (word_t) -> Id_t
    """Return the Id that identifies a word in command position.

    Token words report their own id.  A Compound word made of exactly one
    literal part reports that literal's Id when it is `{`, `}`, `=`,
    `return`, or any Id of Kind.KW; any other Compound word is
    Id.Word_Compound.

    Raises:
      AssertionError: for any other word variant.
    """
    UP_w = w
    with tagswitch(w) as case:
        if case(word_e.Token):
            return cast(Token, UP_w).id

        elif case(word_e.Compound):
            parts = cast(compound_word, UP_w).parts

            # Has to be a single literal part
            if len(parts) != 1:
                return Id.Word_Compound

            lit_id = _LiteralId(parts[0])
            if lit_id == Id.Undefined_Tok:
                return Id.Word_Compound

            # OSH and Oil recognize:  { }
            # Oil recognizes:         = return
            if lit_id in (Id.Lit_LBrace, Id.Lit_RBrace, Id.Lit_Equals,
                          Id.ControlFlow_Return):
                return lit_id

            if consts.GetKind(lit_id) == Kind.KW:
                return lit_id

            return Id.Word_Compound

        else:
            raise AssertionError(w.tag_())
def _Next(self, lex_mode):
    # type: (lex_mode_t) -> None
    """Read one token in `lex_mode` and make it the current token.

    Updates self.cur_token, self.token_type and self.token_kind.
    """
    # NOTE(review): the previous docstring said this only records the next
    # lex state WITHOUT reading a token (needed for interactive parsing), but
    # the body reads from the lexer right away.  Confirm which is intended.
    tok = self.lexer.Read(lex_mode)
    self.cur_token = tok
    self.token_type = tok.id
    self.token_kind = consts.GetKind(tok.id)
def _Peek(self):
    # type: () -> None
    """Read the pending token, if a lex mode has been requested.

    Does nothing when self.next_lex_mode is lex_mode_e.Undefined.  Otherwise
    reads one token in that mode, stores it in self.cur_token /
    self.token_type / self.token_kind, appends it to the parse context's
    trail, and resets self.next_lex_mode to Undefined.
    """
    pending_mode = self.next_lex_mode
    if pending_mode == lex_mode_e.Undefined:
        return

    tok = self.lexer.Read(pending_mode)
    self.cur_token = tok
    self.token_type = tok.id
    self.token_kind = consts.GetKind(tok.id)

    self.parse_ctx.trail.AppendToken(tok)  # For completion
    self.next_lex_mode = lex_mode_e.Undefined
def testTokens(self):
    """Print every Kind and Id, then spot-check a few Id -> Kind lookups."""
    print(Id.Op_Newline)
    print(Tok(Id.Op_Newline, '\n'))
    print(Id.Op_Newline)

    print(Kind.Eof)
    print(Kind.Left)
    print('--')

    # Public members are the ones whose name starts with an uppercase letter.
    num_kinds = 0
    for kind_name in dir(Kind):
        if not kind_name[0].isupper():
            continue
        print('%-20s %s' % (kind_name, getattr(Kind, kind_name)))
        num_kinds += 1

    print()
    print('Number of Kinds:', num_kinds)
    print()

    for id_name in dir(Id):
        if not id_name[0].isupper():
            continue
        print('%-30s %s' % (id_name, getattr(Id, id_name)))

    # 309 out of 256 tokens now
    print()
    print('Number of IDs:', len(ID_SPEC.id_str2int))

    # (token id, token text, expected kind)
    expectations = [
        (Id.Arith_Plus, '+', Kind.Arith),
        (Id.Arith_CaretEqual, '^=', Kind.Arith),
        (Id.Arith_RBrace, '}', Kind.Arith),
        (Id.BoolBinary_GlobDEqual, '==', Kind.BoolBinary),
        (Id.BoolBinary_Equal, '=', Kind.BoolBinary),
    ]
    for tok_id, text, expected_kind in expectations:
        t = Tok(tok_id, text)
        self.assertEqual(expected_kind, consts.GetKind(t.id))
def CommandKind(w):
    # type: (word_t) -> Kind_t
    """Return the coarse-grained Kind the CommandParser uses for a word.

    A Token word reports the Kind of its token Id; every other word is
    Kind.Word.
    """
    if w.tag_() != word_e.Token:
        # NOTE: This is a bit inconsistent with CommandId, because we never
        # return Kind.KW (or Kind.Lit).  But the CommandParser is easier to
        # write this way.
        return Kind.Word

    return consts.GetKind(cast(Token, w).id)
def KeywordToken(w):
    # type: (compound_word) -> Tuple[Kind_t, Optional[Token]]
    """Test whether a compound word is a single control flow token.

    Returns:
      (Kind.ControlFlow, token) when the word has exactly one part, that part
      is a literal, and the literal's Kind is ControlFlow.  Otherwise
      (Kind.Undefined, None).
    """
    no_token = None  # type: Optional[Token]

    parts = w.parts
    if len(parts) == 1:
        only_part = parts[0]
        lit_id = _LiteralId(only_part)
        if lit_id != Id.Undefined_Tok:
            kind = consts.GetKind(lit_id)
            if kind == Kind.ControlFlow:
                return kind, cast(Token, only_part)

    return Kind.Undefined, no_token
def _NextOne(self, lex_mode=lex_mode_e.DBracket):
    # type: (lex_mode_t) -> None
    """Advance to the next word and refresh self.op_id / self.b_kind.

    self.words holds the current word plus at most one buffered word.  With
    two words buffered, the second becomes current and nothing is read.  With
    zero or one, a new word is read from the word parser in `lex_mode` and
    stored in slot 0.
    """
    n = len(self.words)
    if n == 2:
        # The buffered word was already read, so a different lex mode can't
        # be honored here.
        assert lex_mode == lex_mode_e.DBracket
        buffered = self.words.pop()
        self.words[0] = buffered
        self.cur_word = buffered

    elif n < 2:
        new_word = self.w_parser.ReadWord(lex_mode)  # may raise
        if n:
            self.words[0] = new_word
        else:
            self.words.append(new_word)
        self.cur_word = new_word

    assert self.cur_word is not None
    self.op_id = word_.BoolId(self.cur_word)
    self.b_kind = consts.GetKind(self.op_id)
def Parse(lexer):
    # type: (Lexer) -> List[Token]
    """Parse a QSN literal and return the tokens between its quotes.

    Grammar:

        qsn = SingleQuote Kind.Char* SingleQuote Whitespace? Eof_Real

    Returns:
      The Kind.Char tokens found between the quotes, in order.  Decoding
      them is left to the caller.

    Errors are reported through p_die().
    """
    opening = lexer.Read(lex_mode_e.QSN)
    # Caller ensures this.  It's really a left single quote.
    assert opening.id == Id.Right_SingleQuote

    char_tokens = []  # type: List[Token]

    # Collect tokens for as long as they are Kind.Char.
    tok = lexer.Read(lex_mode_e.QSN)
    while tok.id != Id.Unknown_Tok and consts.GetKind(tok.id) == Kind.Char:
        char_tokens.append(tok)
        tok = lexer.Read(lex_mode_e.QSN)

    if tok.id == Id.Unknown_Tok:  # extra error
        p_die('Unexpected token in QSN string', token=tok)

    if tok.id != Id.Right_SingleQuote:
        p_die('Expected closing single quote in QSN string', token=tok)

    # HACK: read in shell's SQ_C mode to get whitespace, which is disallowed
    # INSIDE QSN.  This gets Eof_Real too.
    tok = lexer.Read(lex_mode_e.SQ_C)

    # Doesn't work because we want to allow literal newlines / tabs
    if tok.id == Id.Char_Literals:
        if not _IsWhitespace(tok.val):
            p_die("Unexpected data after closing quote", token=tok)
        tok = lexer.Read(lex_mode_e.QSN)

    if tok.id != Id.Eof_Real:
        p_die('Unexpected token after QSN string', token=tok)

    return char_tokens
def Read(self, lex_mode):
    # type: (lex_mode_t) -> Token
    """Lex one token from the current line, starting at self.line_pos.

    Returns the shared _EOL_TOK sentinel at end of line; no span is recorded
    for it and the position does not move.  Otherwise records a line span in
    the arena (or reuses the last one when self.arena_skip is set), advances
    self.line_pos past the token, and returns the new Token.
    """
    # Inner loop optimization: work on locals.
    text = self.line
    start = self.line_pos

    tok_type, end_pos = match.OneToken(lex_mode, text, start)
    if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
        return _EOL_TOK

    # Save on allocations!  We often don't look at the token value.
    # TODO: can inline this function with formula on 16-bit Id.
    kind = consts.GetKind(tok_type)

    # Whitelist doesn't work well?  Use blacklist for now.
    # - Kind.KW is sometimes a literal in a word
    # - Kind.Right is for " in here docs.  Lexer isn't involved.
    # - Got an error with Kind.Left too that I don't understand
    if kind not in (Kind.Arith, Kind.Op, Kind.WS, Kind.Ignored, Kind.Eof):
        tok_val = text[start:end_pos]  # type: Optional[str]
    else:
        tok_val = None

    # NOTE: We're putting the arena hook in LineLexer and not Lexer because we
    # want it to be "low level".  The only thing fabricated here is a newline
    # added at the last line, so we don't end with \0.
    if not self.arena_skip:
        span_id = self.arena.AddLineSpan(self.line_id, start, end_pos - start)
        self.last_span_id = span_id
    else:
        # make another token from the last span
        assert self.last_span_id != runtime.NO_SPID
        span_id = self.last_span_id
        self.arena_skip = False

    t = Token(tok_type, span_id, tok_val)
    self.line_pos = end_pos
    return t
def _MaybeReplaceLeaf(self, node):
    # type: (re_t) -> Tuple[Optional[re_t], bool]
    """Evaluate a regex leaf node, if it is one that needs evaluating.

    Returns:
      A (new_leaf, recurse) pair.  new_leaf is the replacement node, or None
      when the node is left as it is.  recurse is False only for PosixClass
      and PerlClass nodes, and True in every other case.

    Raises:
      NotImplementedError: for a Speck id, name, or symbol that isn't handled.
    """
    tag = node.tag

    if tag == re_e.Speck:
        speck_id = node.id
        if speck_id == Id.Expr_Dot:
            return re.Primitive(Id.Re_Dot), True
        if speck_id == Id.Arith_Caret:  # ^
            return re.Primitive(Id.Re_Start), True
        if speck_id == Id.Expr_Dollar:  # $
            return re.Primitive(Id.Re_End), True
        raise NotImplementedError(speck_id)

    if tag == re_e.Token:
        tok_id = node.id
        val = node.val

        if tok_id == Id.Expr_Name:
            if val == 'dot':
                return re.Primitive(Id.Re_Dot), True
            raise NotImplementedError(val)

        if tok_id == Id.Expr_Symbol:
            if val == '%start':
                return re.Primitive(Id.Re_Start), True
            if val == '%end':
                return re.Primitive(Id.Re_End), True
            raise NotImplementedError(val)

        # Must be Id.Char_{OneChar,Hex,Unicode4,Unicode8}
        kind = consts.GetKind(tok_id)
        assert kind == Kind.Char, tok_id
        chars = word_compile.EvalCStringToken(tok_id, val)
        return re.LiteralChars(chars, node.span_id), True

    if tag == re_e.SingleQuoted:
        chars = word_eval.EvalSingleQuoted(node)
        return re.LiteralChars(chars, node.left.span_id), True

    if tag == re_e.DoubleQuoted:
        chars = self.word_ev.EvalDoubleQuotedToString(node)
        return re.LiteralChars(chars, node.left.span_id), True

    if tag == re_e.BracedVarSub:
        chars = self.word_ev.EvalBracedVarSubToString(node)
        return re.LiteralChars(chars, node.spids[0]), True

    if tag == re_e.SimpleVarSub:
        chars = self.word_ev.EvalSimpleVarSubToString(node.token)
        return re.LiteralChars(chars, node.token.span_id), True

    if tag == re_e.Splice:
        obj = self.LookupVar(node.name.val)
        if not isinstance(obj, objects.Regex):
            e_die("Can't splice object of type %r into regex", obj.__class__,
                  token=node.name)
        # Note: we only splice the regex, and ignore flags.
        # Should we warn about this?
        return obj.regex, True

    # These are leaves we don't need to do anything with.
    if tag == re_e.PosixClass or tag == re_e.PerlClass:
        return None, False

    # Anything else: no replacement.
    return None, True
def ParseFactor(self):
    # type: () -> bool_expr_t
    """Parse one factor of a boolean expression.

    Grammar:

        Factor  : WORD
                | UNARY_OP WORD
                | WORD BINARY_OP WORD
                | '(' Expr ')'

    Dispatches on the current word's kind (self.b_kind) and id (self.op_id).

    Returns:
      bool_expr.Unary, bool_expr.Binary or bool_expr.WordTest for the first
      three alternatives, or whatever ParseExpr() returns for a parenthesized
      expression.

    Errors are reported through p_die(): a unary operator whose argument is
    not a Compound or String word, a missing ')', or a current word that
    starts none of the alternatives.
    """
    if self.b_kind == Kind.BoolUnary:
        # Just save the type and not the token itself?
        op = self.op_id
        self._Next()
        w = self.cur_word
        # e.g. [[ -f < ]].  But [[ -f '<' ]] is OK
        tag = w.tag_()
        if tag != word_e.Compound and tag != word_e.String:
            p_die('Invalid argument to unary operator', word=w)
        self._Next()
        node = bool_expr.Unary(op, w)  # type: bool_expr_t
        return node

    if self.b_kind == Kind.Word:
        # Peek ahead another token to tell `WORD BINARY_OP WORD` from a bare
        # WORD.
        t2 = self._LookAhead()
        t2_op_id = word_.BoolId(t2)
        t2_b_kind = consts.GetKind(t2_op_id)

        #log('t2 %s / t2_op_id %s / t2_b_kind %s', t2, t2_op_id, t2_b_kind)
        # Op for < and >, -a and -o pun
        if t2_b_kind == Kind.BoolBinary or t2_op_id in (Id.Op_Less, Id.Op_Great):
            left = self.cur_word

            # Step onto the operator and remember its id.
            self._Next()
            op = self.op_id

            # TODO: Need to change to lex_mode_e.BashRegex.
            # _Next(lex_mode) then?
            is_regex = t2_op_id == Id.BoolBinary_EqualTilde
            if is_regex:
                # The right-hand side of =~ is requested in BashRegex mode.
                self._Next(lex_mode=lex_mode_e.BashRegex)
            else:
                self._Next()

            right = self.cur_word
            if is_regex:
                # NOTE: StaticEval for checking regex syntax isn't enough.  We
                # could need to pass do_ere so that the quoted parts get
                # escaped.
                #ok, s, unused_quoted = word_.StaticEval(right)
                pass

            self._Next()
            return bool_expr.Binary(op, left, right)
        else:
            # [[ foo ]]
            w = self.cur_word
            self._Next()
            return bool_expr.WordTest(w)

    if self.op_id == Id.Op_LParen:
        self._Next()
        node = self.ParseExpr()
        if self.op_id != Id.Op_RParen:
            p_die('Expected ), got %s', word_.Pretty(self.cur_word),
                  word=self.cur_word)
        self._Next()
        return node

    # It's not WORD, UNARY_OP, or '('
    # No return follows: p_die() is relied on not to return here.
    p_die('Unexpected token in boolean expression', word=self.cur_word)
def _PushOilTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Reads tokens from `lex` in Expr mode and feeds them to `p` until
    p.addtoken() reports that the expression is complete.  Tokens that open a
    shell construct (array literal, command sub, quoted string, ${}) are
    parsed by the word / command parsers, and the resulting part is pushed
    as a single Id.Expr_CastedDummy token.

    Args:
      parse_ctx: used for its arena and to make the word / command parsers.
      gr: the grammar; used to classify tokens and look up labels.
      p: the pgen2 parser that receives the tokens.
      lex: the lexer to read from.

    Returns:
      The last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    # A token left over from a nested word parser; it is consumed on the next
    # iteration instead of reading from the lexer.
    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        # _OTHER_BALANCE maps a token id to a balance delta; ids not in the
        # table contribute 0.
        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        # A true result from addtoken() means the parse is complete.
        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if mylib.PYTHON:
            if tok.id == Id.Left_PercentParen:  # %(
                left_tok = tok
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

                # Blame the opening token
                line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
                w_parser = parse_ctx.MakeWordParser(lex, line_reader)
                words = []
                close_tok = None  # type: Optional[Token]
                while True:
                    w = w_parser.ReadWord(lex_mode_e.ShCommand)
                    if 0:
                        log('w = %s', w)

                    if w.tag_() == word_e.Token:
                        # NOTE: this rebinds the outer `tok`.  When the loop
                        # breaks, `tok` is the closing token.
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            close_tok = tok
                            break
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            # Token
                            p_die('Unexpected token in array literal: %r', tok.val, word=w)

                    assert isinstance(w, compound_word)  # for MyPy
                    words.append(w)

                words2 = braces.BraceDetectAll(words)
                words3 = word_.TildeDetectAll(words2)

                typ = Id.Expr_CastedDummy

                lit_part = sh_array_literal(left_tok, words3)
                opaque = cast(Token, lit_part)  # HACK for expr_to_ast
                done = p.addtoken(typ, opaque, gr.tokens[typ])
                assert not done  # can't end the expression

                # Now push the closing )
                ilabel = _Classify(gr, close_tok)
                # `tok` is the same object as close_tok here (see loop above).
                done = p.addtoken(tok.id, close_tok, ilabel)
                assert not done  # can't end the expression

                continue

            # $(  @(  &(
            if tok.id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_AmpParen):

                left_token = tok

                lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
                line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
                c_parser = parse_ctx.MakeParserForCommandSub(
                    line_reader, lex, Id.Eof_RParen)
                node = c_parser.ParseCommandSub()
                # A little gross: Copied from osh/word_parse.py
                right_token = c_parser.w_parser.cur_token

                cs_part = command_sub(left_token, node)
                cs_part.spids.append(left_token.span_id)
                cs_part.spids.append(right_token.span_id)

                typ = Id.Expr_CastedDummy
                opaque = cast(Token, cs_part)  # HACK for expr_to_ast
                done = p.addtoken(typ, opaque, gr.tokens[typ])
                assert not done  # can't end the expression

                # Now push the closing )
                ilabel = _Classify(gr, right_token)
                done = p.addtoken(right_token.id, right_token, ilabel)
                assert not done  # can't end the expression

                continue

            if tok.id == Id.Left_DoubleQuote:
                left_token = tok
                line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
                w_parser = parse_ctx.MakeWordParser(lex, line_reader)

                parts = []  # type: List[word_part_t]
                # The returned token is kept in last_token, which the top of
                # the loop consumes on the next iteration.
                last_token = w_parser.ReadDoubleQuoted(left_token, parts)
                expr_dq_part = double_quoted(left_token, parts)

                typ = Id.Expr_CastedDummy
                opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
                done = p.addtoken(typ, opaque, gr.tokens[typ])
                assert not done  # can't end the expression

                continue

            if tok.id == Id.Left_DollarBrace:
                left_token = tok
                line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
                w_parser = parse_ctx.MakeWordParser(lex, line_reader)

                part, last_token = w_parser.ReadBracedVarSub(left_token)

                # It's casted word_part__BracedVarSub -> dummy -> expr__BracedVarSub!
                typ = Id.Expr_CastedDummy
                opaque = cast(Token, part)  # HACK for expr_to_ast
                done = p.addtoken(typ, opaque, gr.tokens[typ])
                assert not done  # can't end the expression

                continue

            # '' and r'' and c''
            if tok.id in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                          Id.Left_CSingleQuote):
                # Only c'' is lexed in SQ_C mode; '' and r'' use SQ_Raw.
                if tok.id == Id.Left_CSingleQuote:
                    sq_mode = lex_mode_e.SQ_C
                else:
                    sq_mode = lex_mode_e.SQ_Raw

                left_token = tok
                line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
                w_parser = parse_ctx.MakeWordParser(lex, line_reader)
                tokens = []  # type: List[Token]
                last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                       True)

                sq_part = single_quoted(left_token, tokens)

                typ = Id.Expr_CastedDummy
                opaque = cast(Token, sq_part)  # HACK for expr_to_ast
                done = p.addtoken(typ, opaque, gr.tokens[typ])
                assert not done  # can't end the expression
                continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        # NOTE(review): this is the `else` of `while True`, and the outer loop
        # is only left via `return`, so this branch looks unreachable.
        raise parse.ParseError("incomplete input", tok.id, tok)
def testMode_DBracket(self):
    """In DBracket mode, `-z` lexes as one BoolUnary token."""
    lexer = _InitLexer('-z foo')

    first = lexer.Read(lex_mode_e.DBracket)

    self.assertTokensEqual(Tok(Id.BoolUnary_z, '-z'), first)
    self.assertEqual(Kind.BoolUnary, consts.GetKind(first.id))