def _Utf8CharLen(starting_byte): # type: (int) -> int if (starting_byte >> 7) == 0b0: return 1 elif (starting_byte >> 5) == 0b110: return 2 elif (starting_byte >> 4) == 0b1110: return 3 elif (starting_byte >> 3) == 0b11110: return 4 else: e_strict(INVALID_START)
def PreviousUtf8Char(s, i):
    # type: (str, int) -> int
    """Return the byte offset of the character that ends just before offset i.

    To find the first byte of the last character in s, pass len(s) as i.

    Validates UTF-8 along the way: problems are reported through
    e_strict(INVALID_START).
    """
    # Every byte of a valid UTF-8 string has one of these shapes:
    #
    #   0xxxxxxx (1-byte char)
    #   110xxxxx (start of 2-byte char)
    #   1110xxxx (start of 3-byte char)
    #   11110xxx (start of 4-byte char)
    #   10xxxxxx (continuation byte)
    #
    # A byte beginning with 10... can only be a continuation byte; anything
    # else is either the start of a character or invalid data.
    #
    # So we walk backward and stop at the first non-continuation byte.  That
    # byte is interpreted as a start byte, and the length it announces must
    # equal the distance we have travelled.  Things that can go wrong:
    #   * the byte we stopped on is not a valid start byte (e.g., 11111111)
    #   * the start byte announces more or fewer continuation bytes than seen
    #   * we run off the front of the string without finding a start byte
    #
    # Because the scan runs backward, malformed input is not necessarily
    # reported at the same place a normal forward parse would report it.
    end = i
    pos = i
    while pos > 0:
        pos -= 1
        current = ord(s[pos])
        if (current >> 6) == 0b10:
            continue  # continuation byte: keep walking back

        # Found a candidate start byte.  Only a generic error is raised for
        # now, although the exact failure position would be easy to compute.
        # Note the distance may exceed 4 on invalid input.
        distance = end - pos
        if distance != _Utf8CharLen(current):
            e_strict(INVALID_START)
        return pos

    # Never saw a start byte (or there was nothing before offset i).
    e_strict(INVALID_START)
def _ValToIntOrError(self, val, span_id=runtime.NO_SPID):
    # type: (value_t, int) -> int
    """Convert a runtime value to an integer for use in arithmetic.

    Dispatches on the tag of val:

      Undef -- reported via e_strict
      Int   -- returns the stored integer (val.i)
      Str   -- delegates to self._StringToInteger, which may call e_strict
      Obj   -- Python-only path: returns val.obj when it is an int

    Any error.Strict raised while doing so is re-raised only when
    self.exec_opts.strict_arith() is true; otherwise the result is 0.

    Value kinds not listed above fall out of the switch and are reported via
    e_die, independently of strict_arith.

    span_id is forwarded to the error helpers for location reporting.
    """
    try:
        # Keep the un-narrowed value around so each case can cast() it.
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):  # 'nounset' already handled before got here
                # Happens upon a[undefined]=42, which unfortunately turns into a[0]=42.
                #log('blame_word %s arena %s', blame_word, self.arena)
                e_strict('Undefined value in arithmetic context', span_id=span_id)

            elif case(value_e.Int):
                val = cast(value__Int, UP_val)
                return val.i

            elif case(value_e.Str):
                val = cast(value__Str, UP_val)
                return self._StringToInteger(val.s, span_id=span_id)  # calls e_strict

            elif case(value_e.Obj):
                # Note: this handles var x = 42; echo $(( x > 2 )).
                if mylib.PYTHON:
                    val = cast(value__Obj, UP_val)
                    if isinstance(val.obj, int):
                        return val.obj
                # Reached when the Obj payload was not returned above.
                raise AssertionError()  # not in C++

    except error.Strict as e:
        # Strict errors are fatal only under strict_arith; otherwise the
        # value silently evaluates to 0.
        if self.exec_opts.strict_arith():
            raise
        else:
            return 0

    # Arrays and associative arrays always fail -- not controlled by
    # strict_arith.
    # In bash, (( a )) is like (( a[0] )), but I don't want that.
    # And returning '0' gives different results.
    e_die("Expected a value convertible to integer, got %s", ui.ValType(val), span_id=span_id)
def _NextUtf8Char(s, i):
    # type: (str, int) -> int
    """Return the byte offset just past the character that starts at offset i.

    This is normally the offset of the following character; for the last
    character of s it is len(s).

    Validates UTF-8: a bad lead byte or continuation byte is reported by the
    helpers called here, and a character cut off by the end of the string is
    reported through e_strict(INCOMPLETE_CHAR).
    """
    size = len(s)
    assert i < size, i  # should always be in range

    # The lead byte tells us where this character is supposed to end.
    end = i + _Utf8CharLen(ord(s[i]))

    pos = i + 1
    while pos < end:
        if pos >= size:
            e_strict(INCOMPLETE_CHAR)
        _CheckContinuationByte(s[pos])
        pos += 1

    return end
def _NextUtf8Char(s, i):
    # type: (str, int) -> int
    """Return the byte offset just past the character that starts at offset i.

    This is normally the offset of the following character; for the last
    character of s it is len(s).

    Validates UTF-8: a bad lead byte or continuation byte is reported by the
    helpers called here, and a character cut off by the end of the string is
    reported through e_strict(INCOMPLETE_CHAR).
    """
    lead = ord(s[i])  # Should never raise IndexError

    try:
        stop = i + _Utf8CharLen(lead)
        pos = i + 1
        while pos < stop:
            # Indexing past the end of s is what signals a truncated character.
            _CheckContinuationByte(s[pos])
            pos += 1
        i = stop
    except IndexError:
        e_strict(INCOMPLETE_CHAR)

    return i
def _StringToInteger(self, s, span_id=runtime.NO_SPID):
    # type: (str, int) -> int
    """Use bash-like rules to coerce a string to an integer.

    Runtime parsing enables silly stuff like $(( $(echo 1)$(echo 2) + 1 )) => 13

    0xAB -- hex constant (the prefix may also be written 0X, as in bash)
    042  -- octal constant
    42   -- decimal constant
    64#z -- arbitrary base constant

    bare word: variable
    quoted word: string (not done?)

    In a base#digits constant the digit values are, in order: 0-9, a-z
    (10..35), A-Z (36..61), '@' (62) and '_' (63).  As in bash, when the base
    is 36 or less an uppercase letter means the same as its lowercase
    counterpart, so 16#FF and 16#ff are both 255.

    Args:
      s: the text to convert.
      span_id: location forwarded to the error helpers.

    Returns:
      The integer value of s.

    Malformed constants are reported through e_strict, or through e_die for
    the cases marked fatal below.  When self.exec_opts.eval_unsafe_arith() is
    on and a parse context is available, text that is not a plain integer is
    parsed and evaluated as an arithmetic expression instead.
    """
    # Bash accepts both 0x and 0X; without the second test, 0X... would fall
    # into the octal branch below and be rejected.
    if s.startswith('0x') or s.startswith('0X'):
        try:
            integer = int(s, 16)
        except ValueError:
            e_strict('Invalid hex constant %r', s, span_id=span_id)
        return integer

    if s.startswith('0'):
        try:
            integer = int(s, 8)
        except ValueError:
            e_strict('Invalid octal constant %r', s, span_id=span_id)
        return integer

    if '#' in s:
        b, digits = mylib.split_once(s, '#')
        try:
            base = int(b)
        except ValueError:
            e_strict('Invalid base for numeric constant %r', b, span_id=span_id)

        integer = 0
        for ch in digits:
            if IsLower(ch):
                digit = ord(ch) - ord('a') + 10
            elif IsUpper(ch):
                if base <= 36:
                    # Case-insensitive for small bases, like bash.
                    digit = ord(ch) - ord('A') + 10
                else:
                    digit = ord(ch) - ord('A') + 36
            elif ch == '@':  # horrible syntax
                digit = 62
            elif ch == '_':
                digit = 63
            elif ch.isdigit():
                digit = int(ch)
            else:
                e_strict('Invalid digits for numeric constant %r', digits, span_id=span_id)

            if digit >= base:
                e_strict('Digits %r out of range for base %d', digits, base, span_id=span_id)

            integer = integer * base + digit
        return integer

    try:
        # Normal base 10 integer.  This includes negative numbers like '-42'.
        integer = int(s)
    except ValueError:
        # doesn't look like an integer

        # note: 'test' and '[' never evaluate recursively
        if self.exec_opts.eval_unsafe_arith() and self.parse_ctx:
            # Special case so we don't get EOF error
            if len(s.strip()) == 0:
                return 0

            # For compatibility: Try to parse it as an expression and evaluate it.
            arena = self.parse_ctx.arena

            a_parser = self.parse_ctx.MakeArithParser(s)
            with alloc.ctx_Location(arena, source.Variable(span_id)):
                try:
                    node2 = a_parser.Parse()  # may raise error.Parse
                except error.Parse as e:
                    ui.PrettyPrintError(e, arena)
                    e_die('Parse error in recursive arithmetic', span_id=e.span_id)

            # Prevent infinite recursion of $(( 1x )) -- it's a word that evaluates
            # to itself, and you don't want to reparse it as a word.
            if node2.tag_() == arith_expr_e.Word:
                e_die("Invalid integer constant %r", s, span_id=span_id)
            else:
                integer = self.EvalToInt(node2)
        else:
            if len(s.strip()) == 0 or match.IsValidVarName(s):
                # x42 could evaluate to 0
                e_strict("Invalid integer constant %r", s, span_id=span_id)
            else:
                # 42x is always fatal!
                e_die("Invalid integer constant %r", s, span_id=span_id)

    return integer
def _CheckContinuationByte(byte): # type: (str) -> None if (ord(byte) >> 6) != 0b10: e_strict(INVALID_CONT)