Ejemplo n.º 1
0
def _tm_to_tuple(space, t):
    """Build an app-level ``struct_time`` object from the C struct *t*.

    The integer fields are read with ``rffi.getintfield`` and adjusted to
    Python conventions: the year is offset by 1900, the month and the day
    of the year become 1-based, and the weekday is rotated so that Monday
    is 0.  When ``HAS_TM_ZONE`` is true, the zone name and the GMT offset
    are appended as two extra items.
    """
    year = rffi.getintfield(t, 'c_tm_year') + 1900
    month = rffi.getintfield(t, 'c_tm_mon') + 1  # January becomes 1
    day = rffi.getintfield(t, 'c_tm_mday')
    hour = rffi.getintfield(t, 'c_tm_hour')
    minute = rffi.getintfield(t, 'c_tm_min')
    second = rffi.getintfield(t, 'c_tm_sec')
    weekday = (rffi.getintfield(t, 'c_tm_wday') + 6) % 7  # Monday becomes 0
    yearday = rffi.getintfield(t, 'c_tm_yday') + 1  # January 1st becomes 1
    isdst = rffi.getintfield(t, 'c_tm_isdst')

    fields_w = [
        space.newint(year),
        space.newint(month),
        space.newint(day),
        space.newint(hour),
        space.newint(minute),
        space.newint(second),
        space.newint(weekday),
        space.newint(yearday),
        space.newint(isdst),
    ]

    if not HAS_TM_ZONE:
        w_fields = space.newtuple(fields_w)
    else:
        # CPython calls PyUnicode_DecodeLocale here; whether we should do
        # the same is an open question.
        zone_name = decode_utf8(space,
                                rffi.charp2str(t.c_tm_zone),
                                allow_surrogates=True)
        zone_w = [
            space.newunicode(zone_name),
            space.newint(rffi.getintfield(t, 'c_tm_gmtoff')),
        ]
        w_fields = space.newtuple(fields_w + zone_w)

    w_struct_time = _get_module_object(space, 'struct_time')
    return space.call_function(w_struct_time, w_fields)
Ejemplo n.º 2
0
 def decode_string_escaped(self, start):
     """Slow path of string decoding, used once the fast path gave up.

     The characters between *start* and ``self.pos`` are copied into a
     builder, then scanning resumes at ``self.pos``.  Escape sequences
     are handed to ``decode_escape_sequence``.  On the closing quote the
     collected bytes are decoded as UTF-8, ``self.pos`` and
     ``self.last_type`` are updated and the wrapped unicode is returned.
     A NUL byte or any other character below 0x20 is reported through
     ``self._raise``.
     """
     pos = self.pos
     assert start >= 0
     assert pos >= 0
     out = StringBuilder((pos - start) * 2)  # just an estimate
     out.append_slice(self.s, start, pos)
     while True:
         c = self.ll_chars[pos]
         pos += 1
         if c == '\\':
             pos = self.decode_escape_sequence(pos, out)
             continue
         if c == '"':
             utf8_bytes = out.build()
             text = unicodehelper.decode_utf8(self.space, utf8_bytes)
             self.last_type = TYPE_STRING
             self.pos = pos
             return self.space.newunicode(text)
         if c >= '\x20':
             # ordinary character: keep it verbatim
             out.append(c)
             continue
         # only control characters reach this point
         if c == '\0':
             self._raise("Unterminated string starting at char %d",
                         start - 1)
         else:
             self._raise("Invalid control character at char %d", pos - 1)
Ejemplo n.º 3
0
    def w_convert(self, space, s):
        """Wrap the byte string *s* for app-level code.

        When ``self.returns_unicode`` is true, *s* is first decoded as
        UTF-8; otherwise it is wrapped unchanged.
        """
        if not self.returns_unicode:
            return space.wrap(s)

        from pypy.interpreter.unicodehelper import decode_utf8

        decoded = decode_utf8(space, s)
        return space.wrap(decoded)
Ejemplo n.º 4
0
 def decode_string(self, i):
     """Decode the string whose content starts at index *i* of the input.

     This is the fast path for strings without escape sequences: it scans
     ``self.ll_chars`` until the closing quote.  On success it sets
     ``self.last_type`` and ``self.pos`` (just past the closing quote) and
     returns the wrapped unicode value.  When a backslash is met, the text
     seen so far is handed over to ``decode_string_escaped``.

     A NUL byte is reported through ``self._raise`` as an unterminated
     string; any other character below 0x20 is reported as an invalid
     control character at the index where it was found.
     """
     start = i
     bits = 0
     while True:
         # this loop is a fast path for strings which do not contain escape
         # characters
         ch = self.ll_chars[i]
         i += 1
         # OR all the characters together so a single test at the end
         # tells whether any of them had the 8th bit set
         bits |= ord(ch)
         if ch == '"':
             if bits & 0x80:
                 # the 8th bit is set, it's an utf8 string
                 content_utf8 = self.getslice(start, i-1)
                 content_unicode = unicodehelper.decode_utf8(self.space, content_utf8)
             else:
                 # ascii only, fast path (ascii is a strict subset of
                 # latin1, and we already checked that all the chars are <
                 # 128)
                 content_unicode = strslice2unicode_latin1(self.s, start, i-1)
             self.last_type = TYPE_STRING
             self.pos = i
             return self.space.wrap(content_unicode)
         elif ch == '\\':
             content_so_far = self.getslice(start, i-1)
             self.pos = i-1
             return self.decode_string_escaped(start, content_so_far)
         elif ch < '\x20':
             if ch == '\0':
                 # end of input before the closing quote; start - 1 is
                 # presumably the index of the opening quote
                 self._raise("Unterminated string starting at char %d",
                             start - 1)
             else:
                 # self.pos has not been advanced while scanning, so the
                 # offending character must be located with i
                 self._raise("Invalid control character at char %d", i-1)
Ejemplo n.º 5
0
 def decode_string(self, i):
     """Decode the string whose content starts at index *i* of the input.

     Fast path for strings without escapes: scan ``self.ll_chars`` up to
     the closing quote, then build the unicode value either by UTF-8
     decoding (some byte had its 8th bit set) or by a direct latin-1
     conversion (pure ASCII).  On a backslash or a character below 0x20,
     ``self.pos`` is set to that character and the work is delegated to
     ``decode_string_escaped``.
     """
     begin = i
     seen_bits = 0
     while True:
         c = self.ll_chars[i]
         i += 1
         seen_bits |= ord(c)
         if c == '\\' or c < '\x20':
             # escapes and control characters go through the slow path
             self.pos = i - 1
             return self.decode_string_escaped(begin)
         if c != '"':
             continue
         # closing quote reached
         stop = i - 1
         if seen_bits & 0x80:
             # at least one non-ASCII byte: the content is UTF-8
             raw = self.getslice(begin, stop)
             text = unicodehelper.decode_utf8(self.space, raw)
         else:
             # ASCII only; ASCII is a strict subset of latin-1 and every
             # character was checked to be below 128
             text = strslice2unicode_latin1(self.s, begin, stop)
         self.last_type = TYPE_STRING
         self.pos = i
         return self.space.newunicode(text)
Ejemplo n.º 6
0
def decode_utf8(space, s, ps, end):
    """Decode the run of non-ASCII bytes of *s* that starts at index *ps*.

    The run stops at *end* or at the first byte without its 8th bit set.
    Returns a pair: the decoded text and the index just past the run.
    """
    assert ps >= 0
    first = ps
    # (CPython's "while (s < end && *s != '\\') s++" is inefficient for u"..")
    while ps < end:
        if not (ord(s[ps]) & 0x80):
            break
        ps += 1
    decoded = unicodehelper.decode_utf8(space, s[first:ps])
    return decoded, ps
Ejemplo n.º 7
0
def decode_utf8(space, s, ps, end):
    """Consume consecutive high-bit bytes of *s* from *ps* and decode them.

    Scanning stops at *end* or at the first byte whose 8th bit is clear.
    The result is ``(decoded_text, new_position)``.
    """
    assert ps >= 0
    run_start = ps
    # (CPython's "while (s < end && *s != '\\') s++" is inefficient for u"..")
    while True:
        if ps >= end:
            break
        if ord(s[ps]) & 0x80 == 0:
            break
        ps += 1
    chunk = s[run_start:ps]
    return unicodehelper.decode_utf8(space, chunk), ps
Ejemplo n.º 8
0
 def _create_string(self, start, end, bits):
     """Return the unicode value of the input between *start* and *end*.

     *bits* is tested only for its 8th bit: when it is set the slice is
     decoded as UTF-8, otherwise it is converted directly as latin-1.
     """
     if not (bits & 0x80):
         # ASCII only; ASCII is a strict subset of latin-1 and all the
         # characters were already checked to be below 128
         return strslice2unicode_latin1(self.s, start, end)
     # the 8th bit is set somewhere: treat the slice as UTF-8
     raw_utf8 = self.getslice(start, end)
     return unicodehelper.decode_utf8(self.space, raw_utf8)
Ejemplo n.º 9
0
def decode_utf8(space, s, ps, end, encoding):
    """Re-encode the run of non-ASCII bytes of *s* starting at *ps*.

    The run stops at *end* or at the first byte without its 8th bit set.
    It is decoded as UTF-8, encoded again with *encoding*, and returned
    as a byte string together with the index just past the run.
    """
    assert ps >= 0
    first = ps
    # (CPython's "while (s < end && *s != '\\') s++" is inefficient for u"..")
    while ps < end:
        if not (ord(s[ps]) & 0x80):
            break
        ps += 1
    decoded = unicodehelper.decode_utf8(space, s[first:ps])
    w_decoded = space.wrap(decoded)
    w_encoded = unicodehelper.encode(space, w_decoded, encoding)
    return space.str_w(w_encoded), ps
Ejemplo n.º 10
0
def test_decode_utf8():
    """Check decode_utf8 on ASCII, BMP, lone-surrogate and astral input."""
    space = FakeSpace()

    # inputs whose result is compared directly with a unicode literal
    exact_cases = [
        ("abc", u"abc"),
        ("\xe1\x88\xb4", u"\u1234"),
        ("\xed\xa0\x80", u"\ud800"),
        ("\xed\xb0\x80", u"\udc00"),
    ]
    for raw, expected in exact_cases:
        assert decode_utf8(space, raw) == expected

    # inputs whose result is compared code point by code point
    codepoint_cases = [
        ("\xed\xa0\x80\xed\xb0\x80", [0xd800, 0xdc00]),
        ("\xf0\x90\x80\x80", [0x10000]),
    ]
    for raw, expected_ords in codepoint_cases:
        got = decode_utf8(space, raw)
        assert [ord(ch) for ch in got] == expected_ords
Ejemplo n.º 11
0
 def decode_string_escaped(self, start, content_so_far):
     """Slow path of string decoding, entered when an escape was found.

     *content_so_far* holds the text already scanned; scanning resumes at
     ``self.pos``.  Escape sequences are handed to
     ``decode_escape_sequence``.  On the closing quote the collected bytes
     are decoded as UTF-8, ``self.pos`` and ``self.last_type`` are updated
     and the wrapped value is returned.  A NUL byte is reported through
     ``self._raise`` with *start* as the position.
     """
     out = StringBuilder(len(content_so_far)*2) # just an estimate
     out.append(content_so_far)
     pos = self.pos
     while True:
         c = self.ll_chars[pos]
         pos += 1
         if c == '\\':
             pos = self.decode_escape_sequence(pos, out)
             continue
         if c == '\0':
             self._raise("Unterminated string starting at char %d", start)
             continue
         if c != '"':
             # ordinary character (a dedicated append_char would be nicer)
             out.append_multiple_char(c, 1)
             continue
         # closing quote: finish the string
         utf8_bytes = out.build()
         text = unicodehelper.decode_utf8(self.space, utf8_bytes)
         self.last_type = TYPE_STRING
         self.pos = pos
         return self.space.wrap(text)
Ejemplo n.º 12
0
 def decode_string_escaped(self, start, content_so_far):
     """Finish decoding a string that contains escape sequences.

     The builder is seeded with *content_so_far* and scanning continues
     from ``self.pos``.  The loop ends at the closing quote, where the
     accumulated bytes are decoded as UTF-8 and returned wrapped, after
     ``self.last_type`` and ``self.pos`` have been updated.  A NUL byte is
     reported through ``self._raise`` with *start* as the position.
     """
     cursor = self.pos
     buf = StringBuilder(len(content_so_far) * 2)  # just an estimate
     buf.append(content_so_far)
     while True:
         char = self.ll_chars[cursor]
         cursor += 1
         if char == '\0':
             self._raise("Unterminated string starting at char %d", start)
         elif char == '\\':
             cursor = self.decode_escape_sequence(cursor, buf)
         elif char == '"':
             decoded = unicodehelper.decode_utf8(self.space, buf.build())
             self.last_type = TYPE_STRING
             self.pos = cursor
             return self.space.wrap(decoded)
         else:
             # we should implement append_char
             buf.append_multiple_char(char, 1)
Ejemplo n.º 13
0
 def newlist_text(self, list_t):
     """Return a unicode list built from the byte strings in *list_t*.

     Every item is decoded as UTF-8 with surrogates allowed, and the
     result is passed to ``self.newlist_unicode``.
     """
     decoded_items = [
         decode_utf8(self, raw, allow_surrogates=True) for raw in list_t
     ]
     return self.newlist_unicode(decoded_items)
Ejemplo n.º 14
0
def parsestr(space, encoding, s):
    """Parses a string or bytes literal, and return a wrapped value.

    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    `s` is the whole literal as found in the source: optional prefix
    letters, the opening quote(s), the body and the closing quote(s).
    Malformed quoting is reported through raise_app_valueerror; a bytes
    literal containing a non-ASCII byte raises an app-level SyntaxError.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False

    # string decoration handling
    if quote == "b" or quote == "B":
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == "u" or quote == "U":
        ps += 1
        quote = s[ps]
        saw_u = True
    # A raw marker is only accepted when no 'u' prefix came first.  The
    # parentheses are required: `and` binds tighter than `or`, so without
    # them an upper-case 'R' would be accepted even right after 'u'.
    if not saw_u and (quote == "r" or quote == "R"):
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote != "'" and quote != '"':
        raise_app_valueerror(space, "Internal error: parser passed unquoted literal")
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, "Internal error: parser passed unmatched " "quotes in literal")
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(space, "Internal error: parser passed " "unmatched triple quotes in literal")
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None:
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes).  ASCII ends at
        # 0x7f, so 0x80 itself must be rejected as well.
        for c in substr:
            if ord(c) >= 0x80:
                raise OperationError(
                    space.w_SyntaxError, space.wrap("bytes can only contain ASCII literal characters.")
                )

    if rawmode or "\\" not in substr:
        # nothing to unescape: the body can be used as it is
        if not unicode_literal:
            return space.wrapbytes(substr)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.wrap(v)

    # only a non-raw bytes literal containing backslashes reaches this point
    v = PyString_DecodeEscape(space, substr, "strict", encoding)
    return space.wrapbytes(v)
Ejemplo n.º 15
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Turn the source text of a string literal into a wrapped value.

    *s* is the whole literal: optional prefix letters, the quotes and
    the body.  A 'b' prefix forces a byte string, a 'u' prefix forces a
    unicode string, and 'r' selects raw mode.

    With encoding=iso8859-1 the source is in that encoding, with
    encoding=None it is ASCII only, otherwise it is UTF-8.  A byte
    string result is encoded with the original encoding.

    It is very inefficient, and CPython has very similar code.
    """
    # `pos` walks forward through s; `last` is the index of the final
    # character that still belongs to the literal body's delimiter
    pos = 0
    quote = s[pos]
    is_raw = False

    # prefix letters
    if quote == 'b' or quote == 'B':
        pos += 1
        quote = s[pos]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        pos += 1
        quote = s[pos]
        unicode_literal = True
    if quote == 'r' or quote == 'R':
        pos += 1
        quote = s[pos]
        is_raw = True

    if not (quote == "'" or quote == '"'):
        raise_app_valueerror(
            space, 'Internal error: parser passed unquoted literal')
    pos += 1
    last = len(s) - 1
    if s[last] != quote:
        raise_app_valueerror(
            space,
            'Internal error: parser passed unmatched quotes in literal')
    if last - pos >= 4 and s[pos] == quote and s[pos + 1] == quote:
        # triple-quoted literal: skip the two extra quotes on each side
        pos += 2
        if s[last - 1] != quote or s[last - 2] != quote:
            raise_app_valueerror(
                space,
                'Internal error: parser passed unmatched triple quotes '
                'in literal')
        last -= 2

    if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            assert 0 <= pos <= last
            body = s[pos:last]
        else:
            body = decode_unicode_utf8(space, s, pos, last)
        if is_raw:
            decoded = unicodehelper.decode_raw_unicode_escape(space, body)
        else:
            decoded = unicodehelper.decode_unicode_escape(space, body)
        return space.wrap(decoded)

    # byte string from here on
    must_reencode = not (encoding is None or
                         encoding == "utf-8" or encoding == "utf8" or
                         encoding == "iso-8859-1")
    assert 0 <= pos <= last
    body = s[pos:last]

    if not is_raw and '\\' in s[pos:]:
        # escapes have to be processed
        if must_reencode:
            codec = encoding
        else:
            codec = None
        unescaped = PyString_DecodeEscape(space, body, 'strict', codec)
        return space.wrap(unescaped)

    if not must_reencode:
        return space.wrap(body)
    w_text = space.wrap(unicodehelper.decode_utf8(space, body))
    return unicodehelper.encode(space, w_text, encoding)
Ejemplo n.º 16
0
 def newtext(self, s):
     """Wrap the byte string *s* as unicode.

     *s* is decoded as UTF-8 with surrogates allowed before being handed
     to ``self.newunicode``.
     """
     decoded = decode_utf8(self, s, allow_surrogates=True)
     return self.newunicode(decoded)
Ejemplo n.º 17
0
def parsestr(space, encoding, s):
    """Parses a string or unicode literal, and return usually
    a wrapped value.  If we get an f-string, then instead return
    an unparsed but unquoted W_FString instance.

    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    `s` is the whole literal as found in the source: optional prefix
    letters, the opening quote(s), the body and the closing quote(s).
    Malformed quoting is reported through raise_app_valueerror; a bytes
    literal containing a non-ASCII byte raises an app-level SyntaxError.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False
    saw_f = False

    # string decoration handling: first prefix letter
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        saw_u = True
    elif quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    elif quote == 'f' or quote == 'F':
        ps += 1
        quote = s[ps]
        saw_f = True

    # optional second prefix letter; nothing may follow a 'u'
    if not saw_u:
        if quote == 'r' or quote == 'R':
            ps += 1
            quote = s[ps]
            rawmode = True
        elif quote == 'b' or quote == 'B':
            ps += 1
            quote = s[ps]
            unicode_literal = False
        elif quote == 'f' or quote == 'F':
            ps += 1
            quote = s[ps]
            saw_f = True

    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
            'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        assert 0 <= ps <= q
        if saw_f:
            # f-strings are returned unparsed, only stripped of quotes
            return W_FString(s[ps:q], rawmode)
        if encoding is None:
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.newunicode(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes).  ASCII ends at
        # 0x7f, so 0x80 itself must be rejected as well.
        for c in substr:
            if ord(c) >= 0x80:
                raise oefmt(
                    space.w_SyntaxError,
                    "bytes can only contain ASCII literal characters.")

    if rawmode or '\\' not in substr:
        # nothing to unescape: the body can be used as it is
        if not unicode_literal:
            return space.newbytes(substr)
        elif saw_f:
            return W_FString(substr, rawmode)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.newunicode(v)

    # only a non-raw bytes literal containing backslashes reaches this point
    v = PyString_DecodeEscape(space, substr, 'strict', encoding)
    return space.newbytes(v)
Ejemplo n.º 18
0
 def w_convert(self, space, s):
     """Wrap the byte string *s* for app-level code.

     When ``self.returns_unicode`` is true, *s* is decoded as UTF-8 and
     wrapped with ``space.newunicode``; otherwise it is wrapped with
     ``space.newtext``.
     """
     if not self.returns_unicode:
         return space.newtext(s)
     from pypy.interpreter.unicodehelper import decode_utf8
     decoded = decode_utf8(space, s)
     return space.newunicode(decoded)
Ejemplo n.º 19
0
def unmarshal_unicode(space, u, tc):
    """Read a string from the unmarshaller *u* and return it as unicode.

    The bytes from ``u.get_str()`` are decoded as UTF-8.  *tc* is not
    used here.
    """
    raw_utf8 = u.get_str()
    decoded = unicodehelper.decode_utf8(space, raw_utf8)
    return space.newunicode(decoded)
Ejemplo n.º 20
0
def unmarshal_Unicode(space, u, tc):
    """Read a string from the unmarshaller *u* and wrap it as unicode.

    The bytes from ``u.get_str()`` are decoded as UTF-8.  *tc* is not
    used here.
    """
    raw_utf8 = u.get_str()
    decoded = unicodehelper.decode_utf8(space, raw_utf8)
    return space.wrap(decoded)
Ejemplo n.º 21
0
def unmarshal_unicode(space, u, tc):
    """Read a string from the unmarshaller *u* and wrap it as unicode.

    The bytes from ``u.get_str()`` are decoded as UTF-8 with surrogates
    allowed.  *tc* is not used here.
    """
    raw_utf8 = u.get_str()
    decoded = unicodehelper.decode_utf8(space, raw_utf8,
                                        allow_surrogates=True)
    return space.wrap(decoded)
Ejemplo n.º 22
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Parses a string or unicode literal, and return a wrapped value.

    If encoding=iso8859-1, the source string is also in this encoding.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    `s` is the whole literal as found in the source: optional prefix
    letters, the opening quote(s), the body and the closing quote(s).
    `unicode_literal` is the default kind of the literal; a 'b' prefix
    turns it off and a 'u' prefix turns it on.  Malformed quoting is
    reported through raise_app_valueerror.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """

    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False

    # string decoration handling
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        unicode_literal = True
    # 'r' may follow either of the prefixes above, or stand alone
    if quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                    'quotes in literal')
    if q-ps >= 4 and s[ps] == quote and s[ps+1] == quote:
        # triple quotes
        ps += 2
        if s[q-1] != quote or s[q-2] != quote:
            raise_app_valueerror(space, 'Internal error: parser passed '
                                        'unmatched triple quotes in literal')
        q -= 2

    # From here on, s[ps:q] is the body of the literal without its quotes.
    if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            buf = s
            bufp = ps
            bufq = q
            u = None
        else:
            # String is utf8-encoded, but 'unicode_escape' expects
            # latin-1; So multibyte sequences must be escaped.
            lis = [] # using a list to assemble the value
            end = q
            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
            while ps < end:
                if s[ps] == '\\':
                    # copy the backslash and look at the character after it
                    lis.append(s[ps])
                    ps += 1
                    if ord(s[ps]) & 0x80:
                        # A multibyte sequence will follow, it will be
                        # escaped like \u1234. To avoid confusion with
                        # the backslash we just wrote, we emit "\u005c"
                        # instead.
                        lis.append("u005c")
                # No 'elif' here: the character following a backslash is
                # handled by the very same test below.
                if ord(s[ps]) & 0x80: # XXX inefficient
                    # presumably decode_utf8 returns the run of non-ASCII
                    # bytes re-encoded as utf-16-be, plus the new position
                    # -- verify against the helper's definition
                    w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
                    rn = len(w)
                    assert rn % 2 == 0
                    # emit one \uXXXX escape per pair of bytes of w
                    # (assumes hexbyte renders one byte as two hex digits
                    # -- TODO confirm)
                    for i in range(0, rn, 2):
                        lis.append('\\u')
                        lis.append(hexbyte(ord(w[i])))
                        lis.append(hexbyte(ord(w[i+1])))
                else:
                    # plain 7-bit character: copy it unchanged
                    lis.append(s[ps])
                    ps += 1
            buf = ''.join(lis)
            bufp = 0
            bufq = len(buf)
        assert 0 <= bufp <= bufq
        substr = buf[bufp:bufq]
        if rawmode:
            v = unicodehelper.decode_raw_unicode_escape(space, substr)
        else:
            v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    # Byte string.  The body only has to be re-encoded when the source
    # encoding is something other than utf-8 or iso-8859-1.
    need_encoding = (encoding is not None and
                     encoding != "utf-8" and encoding != "utf8" and
                     encoding != "iso-8859-1")
    assert 0 <= ps <= q
    substr = s[ps : q]
    # The test looks at s[ps:], i.e. the body followed by the closing
    # quote(s).
    if rawmode or '\\' not in s[ps:]:
        if need_encoding:
            w_u = space.wrap(unicodehelper.decode_utf8(space, substr))
            w_v = unicodehelper.encode(space, w_u, encoding)
            return w_v
        else:
            return space.wrap(substr)

    # Non-raw byte string containing backslashes: process the escapes.
    enc = None
    if need_encoding:
        enc = encoding
    v = PyString_DecodeEscape(space, substr, enc)
    return space.wrap(v)
Ejemplo n.º 23
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Turn the source text of a string literal into a wrapped value.

    *s* is the whole literal: optional prefix letters, the quotes and
    the body.  A 'b' prefix forces a byte string, a 'u' prefix forces a
    unicode string, and 'r' selects raw mode.

    With encoding=iso8859-1 the source is in that encoding, with
    encoding=None it is ASCII only, otherwise it is UTF-8.  A byte
    string result is encoded with the original encoding.

    It is very inefficient, and CPython has very similar code.
    """
    # `pos` walks forward through s; `last` is the index of the final
    # character that still belongs to the literal body's delimiter
    pos = 0
    quote = s[pos]
    is_raw = False

    # prefix letters
    if quote == 'b' or quote == 'B':
        pos += 1
        quote = s[pos]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        pos += 1
        quote = s[pos]
        unicode_literal = True
    if quote == 'r' or quote == 'R':
        pos += 1
        quote = s[pos]
        is_raw = True

    if not (quote == "'" or quote == '"'):
        raise_app_valueerror(
            space, 'Internal error: parser passed unquoted literal')
    pos += 1
    last = len(s) - 1
    if s[last] != quote:
        raise_app_valueerror(
            space,
            'Internal error: parser passed unmatched quotes in literal')
    if last - pos >= 4 and s[pos] == quote and s[pos + 1] == quote:
        # triple-quoted literal: skip the two extra quotes on each side
        pos += 2
        if s[last - 1] != quote or s[last - 2] != quote:
            raise_app_valueerror(
                space,
                'Internal error: parser passed unmatched triple quotes '
                'in literal')
        last -= 2

    if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # 'unicode_escape' expects latin-1 bytes, string is ready.
            assert 0 <= pos <= last
            body = s[pos:last]
        else:
            body = decode_unicode_utf8(space, s, pos, last)
        if is_raw:
            decoded = unicodehelper.decode_raw_unicode_escape(space, body)
        else:
            decoded = unicodehelper.decode_unicode_escape(space, body)
        return space.newunicode(decoded)

    # byte string from here on
    must_reencode = not (encoding is None or encoding == "utf-8"
                         or encoding == "utf8" or encoding == "iso-8859-1")
    assert 0 <= pos <= last
    body = s[pos:last]

    if not is_raw and '\\' in s[pos:]:
        # escapes have to be processed
        if must_reencode:
            codec = encoding
        else:
            codec = None
        unescaped = PyString_DecodeEscape(space, body, 'strict', codec)
        return space.newbytes(unescaped)

    if not must_reencode:
        return space.newbytes(body)
    w_text = space.newunicode(unicodehelper.decode_utf8(space, body))
    return unicodehelper.encode(space, w_text, encoding)