def test_newline_bytes(self): import _io # Issue 5433: Excessive optimization in IncrementalNewlineDecoder def _check(dec): assert dec.newlines is None assert dec.decode("\u0D00") == "\u0D00" assert dec.newlines is None assert dec.decode("\u0A00") == "\u0A00" assert dec.newlines is None dec = _io.IncrementalNewlineDecoder(None, translate=False) _check(dec) dec = _io.IncrementalNewlineDecoder(None, translate=True) _check(dec)
def test_newlines2():
    """Decode a mixed-newline UTF-8 byte string one byte at a time.

    The input contains "\\r\\n", "\\n" and a lone "\\r".  With
    ``translate=True`` every one of them must come out as "\\n", and the
    decoder must report having seen all three newline flavours.
    """
    inner_decoder = codecs.getincrementaldecoder("utf-8")()
    decoder = _io.IncrementalNewlineDecoder(inner_decoder, translate=True)
    msg = b"abc\r\n\n\r\r\n\n"
    # Iterating over bytes yields ints, so each value is wrapped back into a
    # one-byte bytes object before being handed to the decoder.
    decoded = "".join(decoder.decode(bytes([byte])) for byte in msg)
    # "\r\n", "\n", "\r", "\r\n", "\n" -> five translated line endings.
    assert decoded == "abc\n\n\n\n\n"
    assert set(decoder.newlines) == {"\r", "\n", "\r\n"}
def decode_source(source_bytes):  # copied from _bootstrap_external.py
    """Turn the raw bytes of a source file into text.

    The encoding is detected with ``tokenize.detect_encoding`` and the
    decoded text is then run through a translating
    ``IncrementalNewlineDecoder`` so that line endings are normalised.
    """
    import _io
    import tokenize  # To avoid bootstrap issues.

    # detect_encoding() wants a readline callable over the raw bytes and
    # returns an (encoding, lines_read) pair; only the encoding is needed.
    encoding_name, _ = tokenize.detect_encoding(
        _io.BytesIO(source_bytes).readline
    )
    text = source_bytes.decode(encoding_name)
    # NOTE: decode() is called without final=True, exactly as in the code
    # this helper was copied from.
    return _io.IncrementalNewlineDecoder(None, translate=True).decode(text)
def test_newline_decoder(self):
    """Exercise ``_io.IncrementalNewlineDecoder`` on top of several codecs.

    Two helper checks are run:

    * ``check_newline_decoding`` feeds text to a translating decoder one
      unit at a time (one byte when an encoding is given, one character
      when the decoder takes str input directly) and verifies both the
      translated output and the progression of the ``newlines`` attribute.
    * ``check_newline_decoding_utf8`` verifies UTF-8 multi-byte sequences
      split across calls, pending "\\r" handling, ``final=True`` and the
      ``getstate()`` / ``setstate()`` round trip.
    """
    import _io
    import codecs

    def check_newline_decoding_utf8(decoder):
        # UTF-8 specific tests for a newline decoder
        def _check_decode(b, s, **kwargs):
            # We exercise getstate() / setstate() as well as decode():
            # decoding, rewinding to the saved state and decoding again
            # must give the same result.
            state = decoder.getstate()
            assert decoder.decode(b, **kwargs) == s
            decoder.setstate(state)
            assert decoder.decode(b, **kwargs) == s

        _check_decode(b'\xe8\xa2\x88', u"\u8888")

        _check_decode(b'\xe8', "")
        _check_decode(b'\xa2', "")
        _check_decode(b'\x88', u"\u8888")

        _check_decode(b'\xe8', "")
        _check_decode(b'\xa2', "")
        _check_decode(b'\x88', u"\u8888")

        # An incomplete sequence must be rejected once input is final.
        _check_decode(b'\xe8', "")
        raises(UnicodeDecodeError, decoder.decode, b'', final=True)

        decoder.reset()
        _check_decode(b'\n', "\n")
        # A trailing "\r" is held back until the next call decides whether
        # it is a lone CR or the first half of CRLF.
        _check_decode(b'\r', "")
        _check_decode(b'', "\n", final=True)
        _check_decode(b'\r', "\n", final=True)

        _check_decode(b'\r', "")
        _check_decode(b'a', "\na")

        _check_decode(b'\r\r\n', "\n\n")
        _check_decode(b'\r', "")
        _check_decode(b'\r', "\n")
        _check_decode(b'\na', "\na")

        _check_decode(b'\xe8\xa2\x88\r\n', u"\u8888\n")
        _check_decode(b'\xe8\xa2\x88', u"\u8888")
        _check_decode(b'\n', "\n")
        _check_decode(b'\xe8\xa2\x88\r', u"\u8888")
        _check_decode(b'\n', "\n")

    def check_newline_decoding(decoder, encoding):
        result = []
        if encoding is not None:
            encoder = codecs.getincrementalencoder(encoding)()

            def _decode_bytewise(s):
                # Decode one byte at a time.  Iterating over a bytes object
                # yields ints, so each value is wrapped back into a
                # one-byte bytes object before it reaches the decoder.
                for b in encoder.encode(s):
                    result.append(decoder.decode(bytes([b])))
        else:
            encoder = None

            def _decode_bytewise(s):
                # Decode one char at a time
                for c in s:
                    result.append(decoder.decode(c))

        assert decoder.newlines is None
        _decode_bytewise(u"abc\n\r")
        assert decoder.newlines == '\n'
        _decode_bytewise(u"\nabc")
        assert decoder.newlines == ('\n', '\r\n')
        _decode_bytewise(u"abc\r")
        assert decoder.newlines == ('\n', '\r\n')
        _decode_bytewise(u"abc")
        assert decoder.newlines == ('\r', '\n', '\r\n')
        _decode_bytewise(u"abc\r")
        assert "".join(result) == "abc\n\nabcabc\nabcabc"

        # After a reset the decoder must have forgotten every newline seen.
        decoder.reset()
        data = u"abc"
        if encoder is not None:
            encoder.reset()
            data = encoder.encode(data)
        assert decoder.decode(data) == "abc"
        assert decoder.newlines is None

    encodings = (
        # None meaning the IncrementalNewlineDecoder takes unicode input
        # rather than bytes input
        None, 'utf-8', 'latin-1',
        'utf-16', 'utf-16-le', 'utf-16-be',
        'utf-32', 'utf-32-le', 'utf-32-be',
    )
    for enc in encodings:
        if enc is not None:
            inner = codecs.getincrementaldecoder(enc)()
        else:
            inner = None
        decoder = _io.IncrementalNewlineDecoder(inner, translate=True)
        check_newline_decoding(decoder, enc)

    decoder = codecs.getincrementaldecoder("utf-8")()
    decoder = _io.IncrementalNewlineDecoder(decoder, translate=True)
    check_newline_decoding_utf8(decoder)
def test_cr_not_ignored2(self): d = _io.IncrementalNewlineDecoder(None, translate=False) d.decode("h\n\r") d.decode("\n") self.assertEqual(('\n', '\r\n'), d.newlines)
def test_cr_not_ignored(self): d = _io.IncrementalNewlineDecoder(None, translate=False) d.decode("h\rello") self.assertEqual('\r', d.newlines)