def get_tokens(self, text):
    """
    Yield ``(tokentype, value)`` pairs from a raw token stream.

    `text` may be ``str`` (it is encoded to ASCII first) or bytes.  When
    ``self.compress`` is ``'gz'`` or ``'bz2'`` the payload is decompressed
    with the matching algorithm before being handed to
    ``self.get_tokens_unprocessed()``.

    Raises ``UnicodeEncodeError`` if a ``str`` input contains non-ASCII
    characters; decompression errors from ``gzip``/``bz2`` propagate.
    """
    if isinstance(text, str):
        # raw token stream never has any non-ASCII characters
        text = text.encode('ascii')
    if self.compress == 'gz':
        import gzip
        # The payload is bytes at this point, so it must be wrapped in a
        # binary buffer (a text buffer rejects bytes with TypeError).
        with gzip.GzipFile('', 'rb', 9, io.BytesIO(text)) as gzipfile:
            text = gzipfile.read()
    elif self.compress == 'bz2':
        import bz2
        text = bz2.decompress(text)

    # do not call Lexer.get_tokens() because we do not want Unicode
    # decoding to occur, and stripping is not optional.
    text = text.strip(b'\n') + b'\n'
    # The position reported by get_tokens_unprocessed() is not part of
    # this method's output.
    for _, ttype, value in self.get_tokens_unprocessed(text):
        yield ttype, value
def get_tokens(self, text):
    """
    Yield ``(tokentype, value)`` pairs from a raw token stream.

    `text` may be ``str`` (it is encoded to ASCII first) or bytes.  When
    ``self.compress`` is ``"gz"`` or ``"bz2"`` the payload is decompressed
    with the matching algorithm before being handed to
    ``self.get_tokens_unprocessed()``.

    Raises ``UnicodeEncodeError`` if a ``str`` input contains non-ASCII
    characters; decompression errors from ``gzip``/``bz2`` propagate.
    """
    if isinstance(text, str):
        # raw token stream never has any non-ASCII characters
        text = text.encode("ascii")
    if self.compress == "gz":
        import gzip

        # GzipFile reads from a *binary* file object; the payload is bytes
        # here, so a text buffer would fail with TypeError.
        with gzip.GzipFile("", "rb", 9, io.BytesIO(text)) as gzipfile:
            text = gzipfile.read()
    elif self.compress == "bz2":
        import bz2

        text = bz2.decompress(text)

    # do not call Lexer.get_tokens() because we do not want Unicode
    # decoding to occur, and stripping is not optional.
    text = text.strip(b"\n") + b"\n"
    # Only the token type and value are passed on; the offset is dropped.
    for _, ttype, value in self.get_tokens_unprocessed(text):
        yield ttype, value
def format(self, tokensource, outfile):
    """
    Write `tokensource` to `outfile` as one ``<tokentype>\\t<repr(value)>``
    line per token.

    `tokensource` is an iterable of ``(tokentype, value)`` pairs and
    `outfile` must be a binary file object.  When ``self.compress`` is
    ``'gz'`` or ``'bz2'`` the lines are compressed on the fly.  When
    ``self.error_color`` is set, lines whose type is ``Token.Error`` are
    passed through ``colorize()`` first.

    Raises ``TypeError`` if `outfile` does not accept bytes.
    """
    try:
        # Probe with an empty write: a text-mode file rejects bytes.
        outfile.write(b'')
    except TypeError as err:
        raise TypeError('The raw tokens formatter needs a binary '
                        'output file') from err

    if self.compress == 'gz':
        import gzip
        gz_stream = gzip.GzipFile('', 'wb', 9, outfile)

        def write(text):
            gz_stream.write(text.encode())

        def flush():
            # close() finalizes the gzip stream (writes the CRC/size
            # trailer); it does not close the caller's `outfile`, which
            # is only flushed here.
            gz_stream.close()
            outfile.flush()
    elif self.compress == 'bz2':
        import bz2
        compressor = bz2.BZ2Compressor(9)

        def write(text):
            outfile.write(compressor.compress(text.encode()))

        def flush():
            # Emit whatever the compressor still buffers before flushing.
            outfile.write(compressor.flush())
            outfile.flush()
    else:
        def write(text):
            outfile.write(text.encode())

        flush = outfile.flush

    if self.error_color:
        for ttype, value in tokensource:
            line = "%s\t%r\n" % (ttype, value)
            if ttype is Token.Error:
                write(colorize(self.error_color, line))
            else:
                write(line)
    else:
        for ttype, value in tokensource:
            write("%s\t%r\n" % (ttype, value))
    flush()
def get_tokens_unprocessed(self, text):
    """
    Parse a raw token stream and yield ``(offset, tokentype, value)``.

    `text` is the bytes payload; it is scanned line by line with
    ``line_re``.  Each line is expected to look like
    ``<dotted token name>\\t<repr of the value>``.  A line without a tab
    is yielded as an ``Error`` token holding the whole line, decoded with
    ``self.encoding``.  ``offset`` is the running total of the lengths of
    the values yielded so far.

    Raises ``ValueError`` when a token name has an empty component or one
    that does not start with an uppercase letter (a non-ASCII name raises
    ``UnicodeDecodeError``, a ``ValueError`` subclass).
    """
    length = 0
    for match in line_re.finditer(text):
        line = match.group()
        try:
            ttypestr, val = line.split(b"\t", 1)
        except ValueError:
            # No tab separator: hand the raw line back as an error token.
            val = line.decode(self.encoding)
            ttype = Error
        else:
            ttype = _ttype_cache.get(ttypestr)
            if ttype is None:
                ttype = Token
                # The name field is bytes; decode it so it can be split
                # on "." and used as attribute names.  The leading
                # component (the root name) is skipped.
                for ttype_ in ttypestr.decode("ascii").split(".")[1:]:
                    if not ttype_ or not ttype_[0].isupper():
                        raise ValueError("malformed token name")
                    ttype = getattr(ttype, ttype_)
                _ttype_cache[ttypestr] = ttype
            # Drop the line's trailing newline, an optional ``u`` string
            # prefix, and then the surrounding quotes.  The value is
            # written with ``%r``, which yields ``'...'`` without a
            # prefix, so unconditionally cutting two leading bytes would
            # lose the first character.
            literal = val[:-1]
            if literal.startswith(b"u"):
                literal = literal[1:]
            # NOTE(review): unicode-escape treats raw bytes as Latin-1;
            # confirm whether non-ASCII values can appear unescaped.
            val = literal[1:-1].decode("unicode-escape")
        yield length, ttype, val
        length += len(val)
def get_tokens_unprocessed(self, text):
    """
    Parse a raw token stream and yield ``(offset, tokentype, value)``.

    `text` is the bytes payload; it is scanned line by line with
    ``line_re``.  Each line is expected to look like
    ``<dotted token name>\\t<repr of the value>``.  A line without a tab
    is yielded as an ``Error`` token holding the whole line, decoded with
    ``self.encoding``.  ``offset`` is the running total of the lengths of
    the values yielded so far.

    Raises ``ValueError`` when a token name has an empty component or one
    that does not start with an uppercase letter (a non-ASCII name raises
    ``UnicodeDecodeError``, a ``ValueError`` subclass).
    """
    length = 0
    for match in line_re.finditer(text):
        line = match.group()
        try:
            ttypestr, val = line.split(b'\t', 1)
        except ValueError:
            # No tab separator: hand the raw line back as an error token.
            val = line.decode(self.encoding)
            ttype = Error
        else:
            ttype = _ttype_cache.get(ttypestr)
            if ttype is None:
                ttype = Token
                # The name field is bytes; decode it before splitting on
                # '.' and resolving each component with getattr().  The
                # leading component (the root name) is skipped.
                for ttype_ in ttypestr.decode('ascii').split('.')[1:]:
                    if not ttype_ or not ttype_[0].isupper():
                        raise ValueError('malformed token name')
                    ttype = getattr(ttype, ttype_)
                _ttype_cache[ttypestr] = ttype
            # Strip the trailing newline, an optional ``u`` prefix and
            # the enclosing quotes.  Values written with ``%r`` carry no
            # prefix, so a fixed two-byte cut would eat the first
            # character of the value.
            literal = val[:-1]
            if literal.startswith(b'u'):
                literal = literal[1:]
            # NOTE(review): unicode-escape treats raw bytes as Latin-1;
            # confirm whether non-ASCII values can appear unescaped.
            val = literal[1:-1].decode('unicode-escape')
        yield length, ttype, val
        length += len(val)
""" "Null" lexer, doesn't highlight anything. """ name = "Text only" aliases = ["text"] filenames = ["*.txt"] mimetypes = ["text/plain"] def get_tokens_unprocessed(self, text): yield 0, Text, text _ttype_cache = {} line_re = re.compile(b(".*?\n")) class RawTokenLexer(Lexer): """ Recreate a token stream formatted with the `RawTokenFormatter`. This lexer raises exceptions during parsing if the token stream in the file is malformed. Additional options accepted: `compress` If set to ``"gz"`` or ``"bz2"``, decompress the token stream with the given compression algorithm before lexing (default: ``""``). """
class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.

    The complete input is emitted as one single `Text` token.
    """
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        """Yield the whole of `text` as a single ``(0, Text, text)`` tuple."""
        yield 0, Text, text


# Maps the type-name field of a raw token line to the token type it was
# resolved to, so each distinct name only has to be resolved once.
_ttype_cache = {}

# Matches one line at a time: the shortest run of characters up to and
# including the next newline.
line_re = re.compile(b('.*?\n'))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']