def test_with_example_file(self):
    """Lex the bundled Common Lisp REPL example file and round-trip it.

    Reads the file as bytes, normalizes line endings, decodes it as
    UTF-8 (stripping a BOM) with a latin1 fallback, then asserts that
    the lexer emits no Error tokens and that concatenating all token
    values reproduces the input text exactly.
    """
    filename = path.join(DIR, 'test.common-lisp-repl')
    lexer = CommonLispREPLLexer()
    # 'with' guarantees the handle is closed (replaces try/finally)
    with open(filename, 'rb') as fp:
        text = fp.read()
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
        # drop a UTF-8 BOM if present
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        text = text.decode('latin1')
    ntext = []
    tokens = []
    # 'ttype' instead of 'type' to avoid shadowing the builtin
    for ttype, val in lexer.get_tokens(text):
        ntext.append(val)
        self.assertNotEqual(
            ttype, Error,
            'error %r at position %d' % (val, len(u''.join(ntext))))
        tokens.append((ttype, val))
    self.assertEqual(
        u''.join(ntext), text,
        '\n'.join(
            difflib.unified_diff(u''.join(ntext).splitlines(),
                                 text.splitlines())))
def test_with_example_file(self):
    """Round-trip the bundled Common Lisp REPL example through the lexer.

    Loads the example file as bytes, normalizes newlines, decodes it
    (UTF-8 with BOM removal, falling back to latin1) and asserts both
    that no Error token is produced and that the joined token values
    equal the input text.
    """
    filename = path.join(DIR, 'test.common-lisp-repl')
    lexer = CommonLispREPLLexer()
    # context manager replaces the original try/finally close
    with open(filename, 'rb') as fp:
        text = fp.read()
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
        # strip a leading UTF-8 byte-order mark
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        text = text.decode('latin1')
    ntext = []
    tokens = []
    # renamed loop variable: 'type' shadowed the builtin
    for ttype, val in lexer.get_tokens(text):
        ntext.append(val)
        self.assertNotEqual(
            ttype, Error,
            'error %r at position %d' % (val, len(u''.join(ntext))))
        tokens.append((ttype, val))
    self.assertEqual(
        u''.join(ntext), text,
        '\n'.join(difflib.unified_diff(u''.join(ntext).splitlines(),
                                       text.splitlines())))
def check_lexer(lx, absfn, outfn):
    """Run lexer `lx` over the file `absfn` and verify its token stream.

    Asserts that no Error tokens are produced and that concatenating
    the token values reproduces the input text.  When STORE_OUTPUT is
    enabled, the token stream is also compared against (or stored to)
    the pickle file `outfn`.
    """
    # 'with' guarantees the handle is closed (replaces try/finally)
    with open(absfn, 'rb') as fp:
        text = fp.read()
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
        # drop a UTF-8 BOM if present
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        text = text.decode('latin1')
    ntext = []
    tokens = []
    # 'ttype' instead of 'type' to avoid shadowing the builtin
    for ttype, val in lx.get_tokens(text):
        ntext.append(val)
        assert ttype != Error, \
            'lexer %s generated error token for %s: %r at position %d' % \
            (lx, absfn, val, len(u''.join(ntext)))
        tokens.append((ttype, val))
    if u''.join(ntext) != text:
        # single-argument print() form is valid on both Python 2 and 3
        print('\n'.join(
            difflib.unified_diff(u''.join(ntext).splitlines(),
                                 text.splitlines())))
        raise AssertionError('round trip failed for ' + absfn)
    # check output against previous run if enabled
    if STORE_OUTPUT:
        # no previous output -- store it
        if not os.path.isfile(outfn):
            with open(outfn, 'wb') as fp:
                pickle.dump(tokens, fp)
            return
        # otherwise load it and compare
        with open(outfn, 'rb') as fp:
            stored_tokens = pickle.load(fp)
        if stored_tokens != tokens:
            f1 = pprint.pformat(stored_tokens)
            f2 = pprint.pformat(tokens)
            print('\n'.join(
                difflib.unified_diff(f1.splitlines(),
                                     f2.splitlines())))
            assert False, absfn
def check_lexer(lx, absfn):
    """Round-trip-check lexer `lx` against the file `absfn`.

    Asserts the lexer produces no Error tokens and that concatenating
    all token values reproduces the (newline-stripped) input text.
    """
    # context manager closes the handle -- the original
    # open(absfn, 'rb').read() leaked it
    with open(absfn, 'rb') as fp:
        text = fp.read()
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
    except UnicodeError:
        text = text.decode('latin1')
    ntext = []
    # 'ttype' instead of 'type' to avoid shadowing the builtin
    for ttype, val in lx.get_tokens(text):
        ntext.append(val)
        assert ttype != Error, 'lexer %s generated error token for %s' % \
            (lx, absfn)
    if u''.join(ntext) != text:
        raise AssertionError('round trip failed for ' + absfn)
def check_lexer(lx, absfn, outfn):
    """Verify lexer `lx` on file `absfn`; optionally diff against `outfn`.

    Checks that no Error tokens are emitted and that the token values
    round-trip to the input text.  If STORE_OUTPUT is set, the token
    list is stored to / compared with the pickle file `outfn`.
    """
    # context manager replaces the original try/finally close
    with open(absfn, 'rb') as fp:
        text = fp.read()
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
        # strip a leading UTF-8 byte-order mark
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        text = text.decode('latin1')
    ntext = []
    tokens = []
    # renamed loop variable: 'type' shadowed the builtin
    for ttype, val in lx.get_tokens(text):
        ntext.append(val)
        assert ttype != Error, \
            'lexer %s generated error token for %s: %r at position %d' % \
            (lx, absfn, val, len(u''.join(ntext)))
        tokens.append((ttype, val))
    if u''.join(ntext) != text:
        # parenthesized single-argument print works on Python 2 and 3
        print('\n'.join(difflib.unified_diff(u''.join(ntext).splitlines(),
                                             text.splitlines())))
        raise AssertionError('round trip failed for ' + absfn)
    # check output against previous run if enabled
    if STORE_OUTPUT:
        # no previous output -- store it
        if not os.path.isfile(outfn):
            with open(outfn, 'wb') as fp:
                pickle.dump(tokens, fp)
            return
        # otherwise load it and compare
        with open(outfn, 'rb') as fp:
            stored_tokens = pickle.load(fp)
        if stored_tokens != tokens:
            f1 = pprint.pformat(stored_tokens)
            f2 = pprint.pformat(tokens)
            print('\n'.join(difflib.unified_diff(f1.splitlines(),
                                                 f2.splitlines())))
            assert False, absfn
def check_lexer(lx, absfn):
    """Round-trip-check lexer `lx` against the file `absfn`.

    Normalizes line endings, decodes (UTF-8 with latin1 fallback), then
    asserts that no Error token is produced and that the concatenated
    token values equal the input text.
    """
    # 'with' closes the handle -- the original open(...).read() leaked it
    with open(absfn, 'rb') as fp:
        text = fp.read()
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
    except UnicodeError:
        text = text.decode('latin1')
    ntext = []
    # 'ttype' instead of 'type' to avoid shadowing the builtin
    for ttype, val in lx.get_tokens(text):
        ntext.append(val)
        assert ttype != Error, 'lexer %s generated error token for %s' % \
            (lx, absfn)
    if u''.join(ntext) != text:
        raise AssertionError('round trip failed for ' + absfn)
def get_tokens(self, text):
    """Yield (tokentype, value) pairs from a raw token stream.

    Encodes str input to ASCII bytes, optionally decompresses the
    stream ('gz' or 'bz2' per `self.compress`), and deliberately
    bypasses Lexer.get_tokens() so no Unicode decoding happens.
    """
    if isinstance(text, str):
        # raw token stream never has any non-ASCII characters
        text = text.encode('ascii')
    if self.compress == 'gz':
        import gzip
        # BUGFIX: GzipFile requires a *binary* file object; `text` is
        # bytes here, so io.StringIO(text) would raise TypeError --
        # io.BytesIO is the correct wrapper.
        gzipfile = gzip.GzipFile('', 'rb', 9, io.BytesIO(text))
        text = gzipfile.read()
    elif self.compress == 'bz2':
        import bz2
        text = bz2.decompress(text)
    # do not call Lexer.get_tokens() because we do not want Unicode
    # decoding to occur, and stripping is not optional.
    text = text.strip(b('\n')) + b('\n')
    for i, t, v in self.get_tokens_unprocessed(text):
        yield t, v
def get_tokens(self, text):
    """Yield (tokentype, value) pairs from a raw token stream.

    Unicode input is encoded down to ASCII bytes, the stream is
    optionally decompressed ('gz' or 'bz2' per `self.compress`), and
    Lexer.get_tokens() is deliberately bypassed so that no Unicode
    decoding takes place; the newline stripping here is mandatory.
    """
    if isinstance(text, unicode):
        # raw token stream never has any non-ASCII characters
        text = text.encode('ascii')
    compression = self.compress
    if compression == 'gz':
        import gzip
        gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
        text = gzipfile.read()
    elif compression == 'bz2':
        import bz2
        text = bz2.decompress(text)
    # do not call Lexer.get_tokens() because we do not want Unicode
    # decoding to occur, and stripping is not optional.
    text = text.strip(b('\n')) + b('\n')
    for pos, ttype, value in self.get_tokens_unprocessed(text):
        yield ttype, value
def get_tokens(self, text):
    """Yield (tokentype, value) pairs from a raw token stream.

    Encodes str input to ASCII bytes, optionally decompresses the
    stream ("gz" or "bz2" per `self.compress`), and deliberately
    bypasses Lexer.get_tokens() so no Unicode decoding happens.
    """
    if isinstance(text, str):
        # raw token stream never has any non-ASCII characters
        text = text.encode("ascii")
    if self.compress == "gz":
        import gzip
        # BUGFIX: GzipFile needs a binary file object; `text` is bytes
        # at this point, so io.StringIO(text) raises TypeError -- use
        # io.BytesIO instead.
        gzipfile = gzip.GzipFile("", "rb", 9, io.BytesIO(text))
        text = gzipfile.read()
    elif self.compress == "bz2":
        import bz2
        text = bz2.decompress(text)
    # do not call Lexer.get_tokens() because we do not want Unicode
    # decoding to occur, and stripping is not optional.
    text = text.strip(b("\n")) + b("\n")
    for i, t, v in self.get_tokens_unprocessed(text):
        yield t, v
def format(self, tokensource, outfile):
    """Write each token as a ``ttype<TAB>repr(value)`` line to `outfile`.

    `outfile` must be opened in binary mode (a probe write of b""
    verifies this).  Output is optionally gzip- or bz2-compressed per
    `self.compress`, and Error tokens are colorized when
    `self.error_color` is set.
    """
    try:
        outfile.write(b(""))
    except TypeError:
        raise TypeError("The raw tokens formatter needs a binary "
                        "output file")
    if self.compress == "gz":
        import gzip
        outfile = gzip.GzipFile("", "wb", 9, outfile)

        def write(text):
            outfile.write(text.encode())
        flush = outfile.flush
        # NOTE(review): the GzipFile is flushed but never closed, so
        # the gzip trailer may not be written -- confirm upstream.
    elif self.compress == "bz2":
        import bz2
        compressor = bz2.BZ2Compressor(9)

        def write(text):
            outfile.write(compressor.compress(text.encode()))

        def flush():
            outfile.write(compressor.flush())
            outfile.flush()
    else:
        def write(text):
            outfile.write(text.encode())
        flush = outfile.flush
    # removed unused locals `lasttype`/`lastval` from the original
    if self.error_color:
        for ttype, value in tokensource:
            line = "%s\t%r\n" % (ttype, value)
            if ttype is Token.Error:
                write(colorize(self.error_color, line))
            else:
                write(line)
    else:
        for ttype, value in tokensource:
            write("%s\t%r\n" % (ttype, value))
    flush()
def format(self, tokensource, outfile):
    """Write each token as a ``ttype<TAB>repr(value)`` line to `outfile`.

    The target must be a binary stream (checked by a probe write of
    b'').  Depending on `self.compress` the output is gzip- or
    bz2-compressed; Error tokens are colorized when `self.error_color`
    is set.
    """
    try:
        outfile.write(b(''))
    except TypeError:
        raise TypeError('The raw tokens formatter needs a binary '
                        'output file')
    if self.compress == 'gz':
        import gzip
        outfile = gzip.GzipFile('', 'wb', 9, outfile)

        def write(text):
            outfile.write(text.encode())
        flush = outfile.flush
        # NOTE(review): the GzipFile is flushed but never closed, so
        # the gzip trailer may not be written -- confirm upstream.
    elif self.compress == 'bz2':
        import bz2
        compressor = bz2.BZ2Compressor(9)

        def write(text):
            outfile.write(compressor.compress(text.encode()))

        def flush():
            outfile.write(compressor.flush())
            outfile.flush()
    else:
        def write(text):
            outfile.write(text.encode())
        flush = outfile.flush
    # removed unused locals `lasttype`/`lastval` from the original
    if self.error_color:
        for ttype, value in tokensource:
            line = "%s\t%r\n" % (ttype, value)
            if ttype is Token.Error:
                write(colorize(self.error_color, line))
            else:
                write(line)
    else:
        for ttype, value in tokensource:
            write("%s\t%r\n" % (ttype, value))
    flush()
def get_tokens_unprocessed(self, text):
    # Parse a raw token dump: each input line should look like
    # "Token.Some.Type<TAB>'repr of value'".
    length = 0
    for match in line_re.finditer(text):
        try:
            ttypestr, val = match.group().split(b('\t'), 1)
        except ValueError:
            # line has no tab separator: emit it verbatim as an Error
            # token so lexing can continue
            val = match.group().decode(self.encoding)
            ttype = Error
        else:
            ttype = _ttype_cache.get(ttypestr)
            if not ttype:
                # resolve "Token.Foo.Bar" by walking attributes from
                # Token; each path segment must start with an uppercase
                # letter, otherwise the stream is malformed
                ttype = Token
                ttypes = ttypestr.split('.')[1:]
                for ttype_ in ttypes:
                    if not ttype_ or not ttype_[0].isupper():
                        raise ValueError('malformed token name')
                    ttype = getattr(ttype, ttype_)
                _ttype_cache[ttypestr] = ttype
            # strip the surrounding quote characters of the repr and
            # unescape the payload
            val = val[2:-2].decode('unicode-escape')
        yield length, ttype, val
        length += len(val)
def format(self, tokensource, outfile):
    """Emit every token as a ``ttype<TAB>repr(value)`` line on `outfile`.

    `outfile` has to accept bytes (verified with a probe write).  The
    stream is gzip- or bz2-compressed when `self.compress` says so, and
    Error tokens are highlighted when `self.error_color` is set.
    """
    # make sure the output stream takes bytes before doing any work
    try:
        outfile.write(b(''))
    except TypeError:
        raise TypeError('The raw tokens formatter needs a binary '
                        'output file')
    mode = self.compress
    if mode == 'gz':
        import gzip
        outfile = gzip.GzipFile('', 'wb', 9, outfile)
        write = lambda text: outfile.write(text.encode())
        flush = outfile.flush
    elif mode == 'bz2':
        import bz2
        compressor = bz2.BZ2Compressor(9)
        write = lambda text: outfile.write(compressor.compress(text.encode()))

        def flush():
            outfile.write(compressor.flush())
            outfile.flush()
    else:
        write = lambda text: outfile.write(text.encode())
        flush = outfile.flush
    if self.error_color:
        for ttype, value in tokensource:
            line = "%s\t%r\n" % (ttype, value)
            write(colorize(self.error_color, line)
                  if ttype is Token.Error else line)
    else:
        for ttype, value in tokensource:
            write("%s\t%r\n" % (ttype, value))
    flush()
class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        # the whole input is emitted as a single Text token at offset 0
        yield 0, Text, text


# cache mapping token-name strings to resolved token types, filled
# lazily while lexing raw token streams
_ttype_cache = {}

# matches one input line including its trailing newline (bytes pattern)
line_re = re.compile(b('.*?\n'))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.

    This lexer raises exceptions during parsing if the token stream in
    the file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream
        with the given compression algorithm before lexing
        (default: ``""``).
    """
    # NOTE(review): this class body appears truncated in this view; the
    # remainder presumably follows elsewhere in the file.
    name = 'Raw token data'
    aliases = ['raw']
class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        # the entire input becomes one Text token starting at offset 0
        yield 0, Text, text


# lazily-filled cache of token-name string -> token type, used when
# re-lexing raw token streams
_ttype_cache = {}

# bytes pattern matching a single line plus its trailing newline
line_re = re.compile(b('.*?\n'))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.

    This lexer raises exceptions during parsing if the token stream in
    the file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream
        with the given compression algorithm before lexing
        (default: ``""``).
    """
    # NOTE(review): the class body is cut off in this view; the rest of
    # its attributes and methods presumably follow elsewhere.
    name = 'Raw token data'
""" "Null" lexer, doesn't highlight anything. """ name = "Text only" aliases = ["text"] filenames = ["*.txt"] mimetypes = ["text/plain"] def get_tokens_unprocessed(self, text): yield 0, Text, text _ttype_cache = {} line_re = re.compile(b(".*?\n")) class RawTokenLexer(Lexer): """ Recreate a token stream formatted with the `RawTokenFormatter`. This lexer raises exceptions during parsing if the token stream in the file is malformed. Additional options accepted: `compress` If set to ``"gz"`` or ``"bz2"``, decompress the token stream with the given compression algorithm before lexing (default: ``""``). """