Esempio n. 1
0
def _NextUtf8Char(s, i):
  """
  Given a string and a byte offset, returns the byte position of the next char.
  Validates UTF-8.
  """
  byte_as_int = ord(s[i])  # Should never raise IndexError

  try:
    if (byte_as_int >> 7) == 0b0:
      i += 1
    elif (byte_as_int >> 5) == 0b110:
      _CheckContinuationByte(s[i+1])
      i += 2
    elif (byte_as_int >> 4) == 0b1110:
      _CheckContinuationByte(s[i+1])
      _CheckContinuationByte(s[i+2])
      i += 3
    elif (byte_as_int >> 3) == 0b11110:
      _CheckContinuationByte(s[i+1])
      _CheckContinuationByte(s[i+2])
      _CheckContinuationByte(s[i+3])
      i += 4
    else:
      raise util.InvalidUtf8(INVALID_START)
  except IndexError:
    raise util.InvalidUtf8(INCOMPLETE_CHAR)

  return i
Esempio n. 2
0
def _PreviousUtf8Char(s, i):
    # type: (str, int) -> int
    """
    Return the byte offset of the character that ends just before offset i.

    To find the first byte of the last character in s, pass len(s) as i.

    Raises util.InvalidUtf8(INVALID_START) when the nearest preceding
    non-continuation byte is at a distance from i that differs from the
    length _Utf8CharLen reports for it, or when there is no non-continuation
    byte before i at all (which includes i == 0).  Anything raised by
    _Utf8CharLen propagates unchanged.
    """
    # Byte shapes in a valid UTF-8 string:
    #
    #   0xxxxxxx  1-byte char
    #   110xxxxx  start of 2-byte char
    #   1110xxxx  start of 3-byte char
    #   11110xxx  start of 4-byte char
    #   10xxxxxx  continuation byte
    #
    # A byte beginning with 10 can only be a continuation byte; anything
    # else is either a start byte or invalid data.
    #
    # Strategy: step backward over continuation bytes until the first
    # non-continuation byte, treat it as a start byte, and require that the
    # length it announces equals the distance walked.  Failure modes:
    #   * the byte stopped on is not a valid start byte (e.g. 11111111)
    #   * the start byte announces more or fewer bytes than were walked
    #   * the beginning of the string is reached without a start byte
    #
    # Because the scan runs backward, malformed input is not necessarily
    # reported at the same place a forward parse would report it.
    pos = i

    while pos > 0:
        pos -= 1
        candidate = ord(s[pos])

        if (candidate >> 6) == 0b10:
            continue  # still inside the character; keep walking back

        # The error is kept generic.  The distance walked may exceed 4 on
        # an invalid string.
        if _Utf8CharLen(candidate) != i - pos:
            raise util.InvalidUtf8(INVALID_START)
        return pos

    # Ran out of bytes without finding a start byte.
    raise util.InvalidUtf8(INVALID_START)
Esempio n. 3
0
def _Utf8CharLen(starting_byte):
    # type: (int) -> int
    if (starting_byte >> 7) == 0b0:
        return 1
    elif (starting_byte >> 5) == 0b110:
        return 2
    elif (starting_byte >> 4) == 0b1110:
        return 3
    elif (starting_byte >> 3) == 0b11110:
        return 4
    else:
        raise util.InvalidUtf8(INVALID_START)
Esempio n. 4
0
def _NextUtf8Char(s, i):
    # type: (str, int) -> int
    """
    Return the byte offset just past the character that starts at s[i].

    This is normally the offset of the next character; for the last
    character in the string it is the offset one past the end.

    The expected length comes from _Utf8CharLen, and each following byte of
    the character is handed to _CheckContinuationByte.  Errors raised by
    either helper propagate, except that an IndexError (the string ending
    before the character is complete) is reported as
    util.InvalidUtf8(INCOMPLETE_CHAR).
    """
    # Outside the try block: callers are expected to pass an in-range
    # offset, so an IndexError here is not translated.
    lead = ord(s[i])

    try:
        end = i + _Utf8CharLen(lead)
        # Walk the trailing bytes; indexing past the end of s raises
        # IndexError, which is translated below.
        pos = i + 1
        while pos < end:
            _CheckContinuationByte(s[pos])
            pos += 1
    except IndexError:
        raise util.InvalidUtf8(INCOMPLETE_CHAR)

    return end
Esempio n. 5
0
def _CheckContinuationByte(byte):
  if (ord(byte) >> 6) != 0b10:
    raise util.InvalidUtf8(INVALID_CONT)
Esempio n. 6
0
def _CheckContinuationByte(byte):
    # type: (str) -> None
    if (ord(byte) >> 6) != 0b10:
        raise util.InvalidUtf8(INVALID_CONT)