Esempio n. 1
0
def _fstring_backslash_is_escape(s, start, pos):
    # Tell whether s[pos] is a backslash that introduces an escape
    # sequence.  That is the case when it ends a run of backslashes of odd
    # length; an even run is just a sequence of escaped backslashes.
    # Only s[start:pos + 1] is examined.
    j = pos
    while j >= start and s[j] == '\\':
        j -= 1
    return (pos - j) % 2 == 1


def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Return the next literal part of an f-string.

    Scans 'fstr.unparsed' from 'fstr.current_index' and stops at the end
    of the string, at an un-doubled '{' (start of an expression) or, when
    rec is not 0, at a '}' (end of a nested format_spec).  On return,
    'fstr.current_index' holds the index where the scan stopped.

    Differs from CPython: this version handles double-braces on its own.

    astbuilder -- provides 'space' and error(), used to report a single
                  '}' at top level
    fstr -- f-string state; 'unparsed' and 'raw_mode' are read,
            'current_index' is read and updated
    atom_node -- node handed to astbuilder.error() when reporting
    rec -- nesting level; doubled braces are only collapsed when it is 0
    """
    s = fstr.unparsed
    raw = fstr.raw_mode
    literal_start = fstr.current_index
    in_named_escape = False

    # Get any literal string. It ends when we hit an un-doubled left
    # brace (which isn't part of a unicode name escape such as
    # "\N{EULER CONSTANT}"), or the end of the string.
    i = literal_start
    builder = StringBuilder()
    while i < len(s):
        ch = s[i]
        # A named escape only exists in non-raw mode, and only when the
        # backslash before the 'N' is itself not escaped: in "\\N{x}" the
        # '{' starts a regular expression part.
        if (not in_named_escape and not raw and ch == '{'
                and i - literal_start >= 2 and s[i - 1] == 'N'
                and _fstring_backslash_is_escape(s, literal_start, i - 2)):
            in_named_escape = True
        elif in_named_escape and ch == '}':
            in_named_escape = False
        elif ch == '{' or ch == '}':
            # Check for doubled braces, but only at the top level. If
            # we checked at every level, then f'{0:{3}}' would fail
            # with the two closing braces.
            if rec == 0 and i + 1 < len(s) and s[i + 1] == ch:
                i += 1  # skip over the second brace
            elif rec == 0 and ch == '}':
                # Where a single '{' is the start of a new expression, a
                # single '}' is not allowed.
                astbuilder.error("f-string: single '}' is not allowed",
                                 atom_node)
            else:
                # We're either at a '{', which means we're starting another
                # expression; or a '}', which means we're at the end of this
                # f-string (for a nested format_spec).
                break
        builder.append(ch)
        i += 1

    fstr.current_index = i
    literal = builder.build()
    if not raw and '\\' in literal:
        # Backslash escapes still have to be interpreted: first make the
        # utf-8 text safe for the escape decoder, then decode the escapes.
        space = astbuilder.space
        literal = parsestring.decode_unicode_utf8(space, literal, 0,
                                                  len(literal))
        return unicodehelper.decode_unicode_escape(space, literal)
    else:
        return literal.decode('utf-8')
Esempio n. 2
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Convert the source text of a string literal into a wrapped value.

    's' is the whole literal as written in the source: an optional prefix
    (b/B or u/U, optionally followed by r/R) and then the body enclosed
    in single or triple quotes.

    If encoding=iso8859-1, the source string is also in this encoding.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    A bytes string result is encoded back with the original encoding.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # 'ps' is the read position inside s; 'q' is the index of the first
    # character that is no longer part of the body.
    ps = 0
    quote = s[ps]
    rawmode = False

    # Consume the prefix letters, if there are any.
    if quote in ('b', 'B'):
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote in ('u', 'U'):
        ps += 1
        quote = s[ps]
        unicode_literal = True
    if quote in ('r', 'R'):
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote not in ("'", '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                    'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # Triple-quoted: drop two more quote characters on each side.
        ps += 2
        if not (s[q - 1] == quote and s[q - 2] == quote):
            raise_app_valueerror(space, 'Internal error: parser passed '
                                        'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # The escape decoders expect latin-1 bytes, so the source
            # text can be used directly.
            source = s
            lo = ps
            hi = q
        else:
            # The source is utf-8 but the escape decoders expect latin-1,
            # so every multibyte sequence is rewritten as \uXXXX escapes.
            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
            pieces = []
            stop = q
            while ps < stop:
                if s[ps] == '\\':
                    pieces.append(s[ps])
                    ps += 1
                    if ord(s[ps]) & 0x80:
                        # The backslash just emitted would be read together
                        # with the \uXXXX escape that follows; completing
                        # it to "\u005c" keeps it a literal backslash.
                        pieces.append("u005c")
                if ord(s[ps]) & 0x80:  # XXX inefficient
                    w, ps = decode_utf8(space, s, ps, stop, "utf-16-be")
                    nbytes = len(w)
                    assert nbytes % 2 == 0
                    for k in range(0, nbytes, 2):
                        pieces.append('\\u')
                        pieces.append(hexbyte(ord(w[k])))
                        pieces.append(hexbyte(ord(w[k + 1])))
                else:
                    pieces.append(s[ps])
                    ps += 1
            source = ''.join(pieces)
            lo = 0
            hi = len(source)
        assert 0 <= lo <= hi
        body = source[lo:hi]
        if rawmode:
            value = unicodehelper.decode_raw_unicode_escape(space, body)
        else:
            value = unicodehelper.decode_unicode_escape(space, body)
        return space.wrap(value)

    # Byte string: re-encoding is only needed when the source encoding is
    # neither utf-8 nor latin-1.
    need_encoding = (encoding is not None and
                     encoding not in ("utf-8", "utf8", "iso-8859-1"))
    assert 0 <= ps <= q
    body = s[ps:q]
    if not rawmode and '\\' in s[ps:]:
        # Escapes are present and have to be interpreted.
        enc = encoding if need_encoding else None
        return space.wrap(PyString_DecodeEscape(space, body, enc))

    if not need_encoding:
        return space.wrap(body)
    w_u = space.wrap(unicodehelper.decode_utf8(space, body))
    return unicodehelper.encode(space, w_u, encoding)
Esempio n. 3
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Parse the source form of a str/unicode literal into a wrapped value.

    's' holds the complete literal: an optional prefix (b/B or u/U, then
    optionally r/R) followed by the single- or triple-quoted body.

    If encoding=iso8859-1, the source string is also in this encoding.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When the result is a bytes string, it is encoded with the original
    encoding.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # ps: current position in s;  q: index one past the end of the body
    ps = 0
    quote = s[ps]
    rawmode = False

    # Prefix letters come first: b or u, then possibly r.
    if quote in ('b', 'B'):
        unicode_literal = False
        ps += 1
        quote = s[ps]
    elif quote in ('u', 'U'):
        unicode_literal = True
        ps += 1
        quote = s[ps]
    if quote in ('r', 'R'):
        rawmode = True
        ps += 1
        quote = s[ps]
    if not (quote == "'" or quote == '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                    'quotes in literal')
    is_triple = q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote
    if is_triple:
        # Strip the two extra quote characters on both ends.
        ps += 2
        if not (s[q - 1] == quote and s[q - 2] == quote):
            raise_app_valueerror(space, 'Internal error: parser passed '
                                        'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # The escape decoders expect latin-1 bytes; the source text
            # can be handed over unchanged.
            assert 0 <= ps <= q
            text = s[ps:q]
        else:
            text = decode_unicode_utf8(space, s, ps, q)
        if rawmode:
            value = unicodehelper.decode_raw_unicode_escape(space, text)
        else:
            value = unicodehelper.decode_unicode_escape(space, text)
        return space.wrap(value)

    # Byte string: re-encoding is only needed when the source encoding is
    # neither utf-8 nor latin-1.
    need_encoding = not (encoding is None or encoding == "utf-8" or
                         encoding == "utf8" or encoding == "iso-8859-1")
    assert 0 <= ps <= q
    text = s[ps:q]
    if not rawmode and '\\' in s[ps:]:
        # There are escapes to interpret.
        if need_encoding:
            enc = encoding
        else:
            enc = None
        value = PyString_DecodeEscape(space, text, 'strict', enc)
        return space.wrap(value)

    if not need_encoding:
        return space.wrap(text)
    w_u = space.wrap(unicodehelper.decode_utf8(space, text))
    return unicodehelper.encode(space, w_u, encoding)
Esempio n. 4
0
def parsestr(space, encoding, s, unicode_literal=False):
    """Parse the source text of a string literal and return it wrapped.

    's' is the literal exactly as written: an optional prefix (b/B or
    u/U, then optionally r/R) and the body in single or triple quotes.

    If encoding=iso8859-1, the source string is also in this encoding.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it is encoded with the original
    encoding.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # 'ps' points into s; 'q' is the index just past the literal's body.
    ps = 0
    quote = s[ps]
    rawmode = False

    # Handle the prefix letters.
    if quote in ('b', 'B'):
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote in ('u', 'U'):
        ps += 1
        quote = s[ps]
        unicode_literal = True
    if quote in ('r', 'R'):
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote not in ("'", '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
            'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # Triple quotes: two more quote characters to drop on each side.
        ps += 2
        if not (s[q - 1] == quote and s[q - 2] == quote):
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal:
        if encoding is None or encoding == "iso-8859-1":
            # The escape decoders expect latin-1 bytes; use the source
            # text unchanged.
            assert 0 <= ps <= q
            body = s[ps:q]
        else:
            unicodehelper.check_utf8_or_raise(space, s, ps, q)
            body = decode_unicode_utf8(space, s, ps, q)
        if rawmode:
            decoded = unicodehelper.decode_raw_unicode_escape(space, body)
        else:
            decoded = unicodehelper.decode_unicode_escape(space, body)
        utf8, length = decoded
        return space.newutf8(utf8, length)

    # Byte string: re-encoding is only needed when the source encoding is
    # neither utf-8 nor latin-1.
    need_encoding = (encoding is not None and
                     encoding not in ("utf-8", "utf8", "iso-8859-1"))
    assert 0 <= ps <= q
    body = s[ps:q]
    if not rawmode and '\\' in s[ps:]:
        # Escapes have to be interpreted.
        enc = encoding if need_encoding else None
        return space.newbytes(PyString_DecodeEscape(space, body, 'strict',
                                                    enc))

    if not need_encoding:
        return space.newbytes(body)
    lgt = unicodehelper.check_utf8_or_raise(space, body)
    w_u = space.newutf8(body, lgt)
    return unicodehelper.encode(space, w_u, encoding)
Esempio n. 5
0
def parsestr(space, encoding, s):
    """Parses a string or unicode literal, and return a wrapped value.

    's' is the complete literal as written in the source: an optional
    prefix, then the body in single or triple quotes.  Accepted prefixes
    are b/B (bytes), u/U (unicode) and r/R (raw); 'r' may follow 'b' but
    not 'u'.  Without a 'b' prefix the literal is a unicode string.

    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    Raises an application-level SyntaxError if a bytes literal contains
    a non-ASCII character; malformed input from the parser is reported
    through raise_app_valueerror().

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False

    # string decoration handling
    if quote == "b" or quote == "B":
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == "u" or quote == "U":
        ps += 1
        quote = s[ps]
        saw_u = True
    # The parentheses matter: 'and' binds tighter than 'or', so without
    # them an upper-case 'R' would be accepted even after a 'u' prefix.
    if not saw_u and (quote == "r" or quote == "R"):
        ps += 1
        quote = s[ps]
        rawmode = True
    if quote != "'" and quote != '"':
        raise_app_valueerror(space, "Internal error: parser passed unquoted literal")
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, "Internal error: parser passed unmatched " "quotes in literal")
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(space, "Internal error: parser passed " "unmatched triple quotes in literal")
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        # Non-raw unicode literal: always goes through the escape decoder.
        if encoding is None:
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes).  ASCII ends at
        # 0x7f, so 0x80 itself must be rejected as well.
        for c in substr:
            if ord(c) >= 0x80:
                raise OperationError(
                    space.w_SyntaxError, space.wrap("bytes can only contain ASCII literal characters.")
                )

    if rawmode or "\\" not in substr:
        # Nothing to unescape: the body is the value.
        if not unicode_literal:
            return space.wrapbytes(substr)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.wrap(v)

    # Only a non-raw bytes literal containing backslashes gets here.
    v = PyString_DecodeEscape(space, substr, "strict", encoding)
    return space.wrapbytes(v)
Esempio n. 6
0
def parsestr(space, encoding, s):
    """Parses a string or unicode literal, and return usually
    a wrapped value.  If we get an f-string, then instead return
    an unparsed but unquoted W_FString instance.

    's' is the complete literal as written in the source: up to two
    prefix letters out of b/B, u/U, r/R and f/F (nothing may follow
    'u'), then the body in single or triple quotes.

    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    When a bytes string is returned, it will be encoded with the
    original encoding.

    Raises an application-level SyntaxError if a bytes literal contains
    a non-ASCII character; malformed input from the parser is reported
    through raise_app_valueerror().

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False
    saw_f = False

    # string decoration handling: first prefix letter
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        saw_u = True
    elif quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    elif quote == 'f' or quote == 'F':
        ps += 1
        quote = s[ps]
        saw_f = True

    # second prefix letter; 'u' cannot be combined with anything
    if not saw_u:
        if quote == 'r' or quote == 'R':
            ps += 1
            quote = s[ps]
            rawmode = True
        elif quote == 'b' or quote == 'B':
            ps += 1
            quote = s[ps]
            unicode_literal = False
        elif quote == 'f' or quote == 'F':
            ps += 1
            quote = s[ps]
            saw_f = True

    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
            'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        assert 0 <= ps <= q
        if saw_f:
            # f-strings are handed back undecoded; see the docstring.
            return W_FString(s[ps:q], rawmode)
        if encoding is None:
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.newunicode(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes).  ASCII ends at
        # 0x7f, so 0x80 itself must be rejected as well.
        for c in substr:
            if ord(c) >= 0x80:
                raise oefmt(
                    space.w_SyntaxError,
                    "bytes can only contain ASCII literal characters.")

    if rawmode or '\\' not in substr:
        # Nothing to unescape: the body is the value.
        if not unicode_literal:
            return space.newbytes(substr)
        elif saw_f:
            return W_FString(substr, rawmode)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.newunicode(v)

    # Only a non-raw bytes literal containing backslashes gets here.
    v = PyString_DecodeEscape(space, substr, 'strict', encoding)
    return space.newbytes(v)
Esempio n. 7
0
def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Return the next literal part of an f-string as a wrapped text.

    Scans 'fstr.unparsed' from 'fstr.current_index' and stops at the end
    of the string, at an un-doubled '{', or (when rec is not 0) at a '}'.
    'fstr.current_index' is updated to the position where the scan
    stopped.  At top level (rec == 0) a doubled brace contributes a
    single brace to the literal and a lone '}' is reported through
    astbuilder.error().

    astbuilder -- provides 'space' and error()
    fstr -- f-string state: 'unparsed', 'raw_mode' and 'current_index'
    atom_node -- node handed to astbuilder.error() when reporting
    rec -- nesting level; 0 means top level
    """
    space = astbuilder.space
    raw = fstr.raw_mode

    # Return the next literal part.  Updates the current index inside 'fstr'.
    # Differs from CPython: this version handles double-braces on its own.
    s = fstr.unparsed
    literal_start = fstr.current_index
    assert literal_start >= 0

    # Get any literal string. It ends when we hit an un-doubled left
    # brace (which isn't part of a unicode name escape such as
    # "\N{EULER CONSTANT}"), or the end of the string.
    i = literal_start
    builder = StringBuilder()
    while i < len(s):
        # Note that 'i' is advanced right away: inside the loop body it
        # is the index of the character *after* 'ch'.
        ch = s[i]
        i += 1
        if not raw and ch == '\\' and i < len(s):
            # Backslash escape (non-raw only): look at the escaped
            # character, which from here on is the one held in 'ch'.
            ch = s[i]
            i += 1
            if ch == 'N':
                if i < len(s) and s[i] == '{':
                    # "\N{...}": skip up to and including the closing
                    # '}' so its braces are not taken for an expression.
                    while i < len(s) and s[i] != '}':
                        i += 1
                    if i < len(s):
                        i += 1
                    continue
                elif i < len(s):
                    i += 1
                # NOTE(review): "\N" without a following '{' skips one
                # more character and leaves the loop; the assert after
                # the loop expects a brace or the end of the string at
                # 'i' -- confirm that holds for input such as "\Nab".
                break
            if ch == '{':
                # "\{" gets an "invalid escape sequence" deprecation
                # warning; if that warning is raised as an exception it
                # is reported through astbuilder.error() instead.
                msg = "invalid escape sequence '%s'"
                try:
                    space.warn(space.newtext(msg % ch),
                               space.w_DeprecationWarning)
                except error.OperationError as e:
                    if e.match(space, space.w_DeprecationWarning):
                        astbuilder.error(msg % ch, atom_node)
                    else:
                        raise
        if ch == '{' or ch == '}':
            # Check for doubled braces, but only at the top level. If
            # we checked at every level, then f'{0:{3}}' would fail
            # with the two closing braces.
            if rec == 0 and i < len(s) and s[i] == ch:
                assert 0 <= i <= len(s)
                # Flush everything up to and including the first brace,
                # then restart the pending slice after the second one so
                # only a single brace ends up in the literal.
                builder.append(s[literal_start:i])
                i += 1  # skip over the second brace
                literal_start = i
            elif rec == 0 and ch == '}':
                # Step back onto the '}' and record that position before
                # reporting the error.
                i -= 1
                assert i >= 0
                fstr.current_index = i
                # Where a single '{' is the start of a new expression, a
                # single '}' is not allowed.
                astbuilder.error("f-string: single '}' is not allowed",
                                 atom_node)
            else:
                # We're either at a '{', which means we're starting another
                # expression; or a '}', which means we're at the end of this
                # f-string (for a nested format_spec).
                # Step back so that 'i' is the index of the brace itself.
                i -= 1
                break
    assert 0 <= i <= len(s)
    assert i == len(s) or s[i] == '{' or s[i] == '}'
    # Flush the part of the literal that has not been copied yet.
    builder.append(s[literal_start:i])

    fstr.current_index = i
    literal = builder.build()
    # presumably the number of code points in the utf-8 string 'literal';
    # it is passed on to space.newtext() below
    lgt = codepoints_in_utf8(literal)
    if not raw and '\\' in literal:
        # Backslash escapes are present: decode them, which also yields
        # the length to use for the decoded text.
        literal = parsestring.decode_unicode_utf8(space, literal, 0,
                                                  len(literal))
        literal, lgt, pos = unicodehelper.decode_unicode_escape(space, literal)
    return space.newtext(literal, lgt)