Esempio n. 1
0
 def test_with_example_file(self):
     """Round-trip the bundled Common Lisp REPL sample through the lexer.

     Normalizes line endings and a possible BOM, then checks that no
     Error token is produced and that concatenating all token values
     reproduces the input exactly.
     """
     sample = path.join(DIR, 'test.common-lisp-repl')
     lexer = CommonLispREPLLexer()
     with open(sample, 'rb') as handle:
         raw = handle.read()
     # Normalize CRLF and guarantee exactly one trailing newline.
     raw = raw.replace(b('\r\n'), b('\n')).strip(b('\n')) + b('\n')
     try:
         decoded = raw.decode('utf-8')
         # Drop a leading byte-order mark, if present.
         if decoded.startswith(u'\ufeff'):
             decoded = decoded[len(u'\ufeff'):]
     except UnicodeError:
         decoded = raw.decode('latin1')
     pieces = []
     collected = []
     for ttype, value in lexer.get_tokens(decoded):
         pieces.append(value)
         self.assertNotEqual(
             ttype, Error,
             'error %r at position %d' % (value, len(u''.join(pieces))))
         collected.append((ttype, value))
     diff = '\n'.join(
         difflib.unified_diff(u''.join(pieces).splitlines(),
                              decoded.splitlines()))
     self.assertEqual(u''.join(pieces), decoded, diff)
Esempio n. 2
0
 def test_with_example_file(self):
     """Lex the bundled Common Lisp REPL sample and verify the round trip.

     Fails if the lexer emits an Error token or if the concatenation of
     all token values differs from the (newline/BOM-normalized) input.
     """
     filename = path.join(DIR, 'test.common-lisp-repl')
     lexer = CommonLispREPLLexer()
     fp = open(filename, 'rb')
     try:
         text = fp.read()
     finally:
         fp.close()
     # Normalize line endings and ensure exactly one trailing newline.
     text = text.replace(b('\r\n'), b('\n'))
     text = text.strip(b('\n')) + b('\n')
     try:
         text = text.decode('utf-8')
         # Strip a UTF-8 byte-order mark, if any.
         if text.startswith(u'\ufeff'):
             text = text[len(u'\ufeff'):]
     except UnicodeError:
         # latin1 can decode any byte sequence, so this cannot fail.
         text = text.decode('latin1')
     ntext = []
     tokens = []
     for type, val in lexer.get_tokens(text):
         ntext.append(val)
         self.assertNotEqual(
             type, Error,
             'error %r at position %d' % (val, len(u''.join(ntext))))
         tokens.append((type, val))
     # Joining all token values must reproduce the input exactly; the
     # failure message is a unified diff of the two texts.
     self.assertEqual(
         u''.join(ntext), text,
         '\n'.join(difflib.unified_diff(u''.join(ntext).splitlines(),
                                        text.splitlines())))
Esempio n. 3
0
def check_lexer(lx, absfn, outfn):
    """Round-trip *absfn* through lexer *lx*; optionally diff stored output.

    Asserts that no Error token is generated and that concatenating all
    token values reproduces the normalized input.  When STORE_OUTPUT is
    set, the token stream is stored in (or compared against) the pickle
    file *outfn*.
    """
    fp = open(absfn, 'rb')
    try:
        text = fp.read()
    finally:
        fp.close()
    # Normalize line endings and ensure exactly one trailing newline.
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
        # Strip a UTF-8 byte-order mark, if any.
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        # latin1 can decode any byte sequence, so this cannot fail.
        text = text.decode('latin1')
    ntext = []
    tokens = []
    for type, val in lx.get_tokens(text):
        ntext.append(val)
        assert type != Error, \
            'lexer %s generated error token for %s: %r at position %d' % \
            (lx, absfn, val, len(u''.join(ntext)))
        tokens.append((type, val))
    # Joining all token values must reproduce the input exactly.
    if u''.join(ntext) != text:
        print '\n'.join(
            difflib.unified_diff(u''.join(ntext).splitlines(),
                                 text.splitlines()))
        raise AssertionError('round trip failed for ' + absfn)

    # check output against previous run if enabled
    if STORE_OUTPUT:
        # no previous output -- store it
        if not os.path.isfile(outfn):
            fp = open(outfn, 'wb')
            try:
                pickle.dump(tokens, fp)
            finally:
                fp.close()
            return
        # otherwise load it and compare
        fp = open(outfn, 'rb')
        try:
            stored_tokens = pickle.load(fp)
        finally:
            fp.close()
        if stored_tokens != tokens:
            f1 = pprint.pformat(stored_tokens)
            f2 = pprint.pformat(tokens)
            print '\n'.join(
                difflib.unified_diff(f1.splitlines(), f2.splitlines()))
            assert False, absfn
Esempio n. 4
0
def check_lexer(lx, absfn):
    """Round-trip *absfn* through the lexer *lx*.

    Asserts that lexing produces no Error tokens and that concatenating
    all token values reproduces the input text.

    Fix: the original did ``open(absfn, 'rb').read()``, leaking the file
    handle; close it explicitly like the sibling check_lexer variants.
    """
    fp = open(absfn, 'rb')
    try:
        text = fp.read()
    finally:
        fp.close()
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
    except UnicodeError:
        # latin1 can decode any byte sequence, so this cannot fail.
        text = text.decode('latin1')
    ntext = []
    for type, val in lx.get_tokens(text):
        ntext.append(val)
        assert type != Error, 'lexer %s generated error token for %s' % \
                (lx, absfn)
    if u''.join(ntext) != text:
        raise AssertionError('round trip failed for ' + absfn)
Esempio n. 5
0
def check_lexer(lx, absfn, outfn):
    """Round-trip *absfn* through lexer *lx*; optionally diff stored output.

    Asserts that no Error token is generated and that concatenating all
    token values reproduces the normalized input.  When STORE_OUTPUT is
    set, the token stream is stored in (or compared against) the pickle
    file *outfn*.
    """
    fp = open(absfn, 'rb')
    try:
        text = fp.read()
    finally:
        fp.close()
    # Normalize line endings and ensure exactly one trailing newline.
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
        # Strip a UTF-8 byte-order mark, if any.
        if text.startswith(u'\ufeff'):
            text = text[len(u'\ufeff'):]
    except UnicodeError:
        # latin1 can decode any byte sequence, so this cannot fail.
        text = text.decode('latin1')
    ntext = []
    tokens = []
    for type, val in lx.get_tokens(text):
        ntext.append(val)
        assert type != Error, \
            'lexer %s generated error token for %s: %r at position %d' % \
            (lx, absfn, val, len(u''.join(ntext)))
        tokens.append((type, val))
    # Joining all token values must reproduce the input exactly.
    if u''.join(ntext) != text:
        print '\n'.join(difflib.unified_diff(u''.join(ntext).splitlines(),
                                             text.splitlines()))
        raise AssertionError('round trip failed for ' + absfn)

    # check output against previous run if enabled
    if STORE_OUTPUT:
        # no previous output -- store it
        if not os.path.isfile(outfn):
            fp = open(outfn, 'wb')
            try:
                pickle.dump(tokens, fp)
            finally:
                fp.close()
            return
        # otherwise load it and compare
        fp = open(outfn, 'rb')
        try:
            stored_tokens = pickle.load(fp)
        finally:
            fp.close()
        if stored_tokens != tokens:
            f1 = pprint.pformat(stored_tokens)
            f2 = pprint.pformat(tokens)
            print '\n'.join(difflib.unified_diff(f1.splitlines(),
                                                 f2.splitlines()))
            assert False, absfn
Esempio n. 6
0
def check_lexer(lx, absfn):
    """Round-trip *absfn* through the lexer *lx*.

    Asserts that lexing produces no Error tokens and that concatenating
    all token values reproduces the (newline-normalized) input text.

    Fix: the original did ``open(absfn, 'rb').read()``, leaking the file
    handle; close it explicitly like the sibling check_lexer variants.
    """
    fp = open(absfn, 'rb')
    try:
        text = fp.read()
    finally:
        fp.close()
    # Normalize line endings and ensure exactly one trailing newline.
    text = text.replace(b('\r\n'), b('\n'))
    text = text.strip(b('\n')) + b('\n')
    try:
        text = text.decode('utf-8')
    except UnicodeError:
        # latin1 can decode any byte sequence, so this cannot fail.
        text = text.decode('latin1')
    ntext = []
    for type, val in lx.get_tokens(text):
        ntext.append(val)
        assert type != Error, 'lexer %s generated error token for %s' % \
                (lx, absfn)
    if u''.join(ntext) != text:
        raise AssertionError('round trip failed for ' + absfn)
Esempio n. 7
0
    def get_tokens(self, text):
        """Decompress/normalize the raw token dump and yield (type, value).

        Accepts str or bytes; str input is ASCII-encoded first since the
        raw token stream never contains non-ASCII characters.

        Fix: ``gzip.GzipFile`` needs a *binary* file object, but the
        original wrapped the bytes in ``io.StringIO`` (str-only), which
        raises TypeError; use ``io.BytesIO`` instead.
        """
        if isinstance(text, str):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            # ``text`` is bytes at this point, so GzipFile must read from
            # a binary buffer.
            gzipfile = gzip.GzipFile('', 'rb', 9, io.BytesIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b('\n')) + b('\n')
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
Esempio n. 8
0
    def get_tokens(self, text):
        """Decompress/normalize the raw token dump and yield (type, value).

        Accepts unicode or bytes; unicode input is ASCII-encoded first
        since the raw token stream never contains non-ASCII characters.
        The stream is processed entirely as bytes.
        """
        if isinstance(text, unicode):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            text = gzipfile.read()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b('\n')) + b('\n')
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
Esempio n. 9
0
    def get_tokens(self, text):
        """Decompress/normalize the raw token dump and yield (type, value).

        Accepts str or bytes; str input is ASCII-encoded first since the
        raw token stream never contains non-ASCII characters.

        Fix: ``gzip.GzipFile`` needs a *binary* file object, but the
        original wrapped the bytes in ``io.StringIO`` (str-only), which
        raises TypeError; use ``io.BytesIO`` instead.
        """
        if isinstance(text, str):
            # raw token stream never has any non-ASCII characters
            text = text.encode("ascii")
        if self.compress == "gz":
            import gzip

            # ``text`` is bytes at this point, so GzipFile must read from
            # a binary buffer.
            gzipfile = gzip.GzipFile("", "rb", 9, io.BytesIO(text))
            text = gzipfile.read()
        elif self.compress == "bz2":
            import bz2

            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        text = text.strip(b("\n")) + b("\n")
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
Esempio n. 10
0
    def format(self, tokensource, outfile):
        """Write one ``tokentype<TAB>repr(value)`` line per token.

        *outfile* must be opened in binary mode (probed with an empty
        bytes write).  If ``self.compress`` is ``"gz"`` or ``"bz2"`` the
        output is compressed; error tokens are colorized when
        ``self.error_color`` is set.

        Fix: removed the unused locals ``lasttype`` and ``lastval``.
        """
        # A text-mode file rejects the empty bytes probe with TypeError.
        try:
            outfile.write(b(""))
        except TypeError:
            raise TypeError("The raw tokens formatter needs a binary " "output file")
        if self.compress == "gz":
            import gzip

            outfile = gzip.GzipFile("", "wb", 9, outfile)

            def write(text):
                outfile.write(text.encode())

            flush = outfile.flush
        elif self.compress == "bz2":
            import bz2

            compressor = bz2.BZ2Compressor(9)

            def write(text):
                outfile.write(compressor.compress(text.encode()))

            def flush():
                # Drain the compressor's internal buffer before flushing.
                outfile.write(compressor.flush())
                outfile.flush()

        else:

            def write(text):
                outfile.write(text.encode())

            flush = outfile.flush

        if self.error_color:
            for ttype, value in tokensource:
                line = "%s\t%r\n" % (ttype, value)
                if ttype is Token.Error:
                    write(colorize(self.error_color, line))
                else:
                    write(line)
        else:
            for ttype, value in tokensource:
                write("%s\t%r\n" % (ttype, value))
        flush()
Esempio n. 11
0
    def format(self, tokensource, outfile):
        """Write one ``tokentype<TAB>repr(value)`` line per token.

        *outfile* must be opened in binary mode (probed with an empty
        bytes write).  If ``self.compress`` is ``'gz'`` or ``'bz2'`` the
        output is compressed; error tokens are colorized when
        ``self.error_color`` is set.

        Fix: removed the unused locals ``lasttype`` and ``lastval``.
        """
        # A text-mode file rejects the empty bytes probe with TypeError.
        try:
            outfile.write(b(''))
        except TypeError:
            raise TypeError('The raw tokens formatter needs a binary '
                            'output file')
        if self.compress == 'gz':
            import gzip
            outfile = gzip.GzipFile('', 'wb', 9, outfile)

            def write(text):
                outfile.write(text.encode())

            flush = outfile.flush
        elif self.compress == 'bz2':
            import bz2
            compressor = bz2.BZ2Compressor(9)

            def write(text):
                outfile.write(compressor.compress(text.encode()))

            def flush():
                # Drain the compressor's internal buffer before flushing.
                outfile.write(compressor.flush())
                outfile.flush()
        else:

            def write(text):
                outfile.write(text.encode())

            flush = outfile.flush

        if self.error_color:
            for ttype, value in tokensource:
                line = "%s\t%r\n" % (ttype, value)
                if ttype is Token.Error:
                    write(colorize(self.error_color, line))
                else:
                    write(line)
        else:
            for ttype, value in tokensource:
                write("%s\t%r\n" % (ttype, value))
        flush()
Esempio n. 12
0
 def get_tokens_unprocessed(self, text):
     """Recreate tokens from tab-separated raw token dump lines.

     Each line has the form ``Token.Type`` + TAB + ``repr(value)``.
     Lines without a tab separator are yielded as Error tokens.
     Yields ``(offset, tokentype, value)`` tuples.
     """
     length = 0
     for match in line_re.finditer(text):
         try:
             ttypestr, val = match.group().split(b('\t'), 1)
         except ValueError:
             # No tab separator: emit the whole line as an Error token.
             val = match.group().decode(self.encoding)
             ttype = Error
         else:
             ttype = _ttype_cache.get(ttypestr)
             if not ttype:
                 # Resolve e.g. 'Token.Literal.String' by walking the
                 # attribute chain from the root Token, then cache it.
                 ttype = Token
                 ttypes = ttypestr.split('.')[1:]
                 for ttype_ in ttypes:
                     if not ttype_ or not ttype_[0].isupper():
                         raise ValueError('malformed token name')
                     ttype = getattr(ttype, ttype_)
                 _ttype_cache[ttypestr] = ttype
             # Drop the surrounding repr characters (two at each end:
             # presumably the u'/"' prefix and the closing quote plus
             # newline) and undo the escaping.
             val = val[2:-2].decode('unicode-escape')
         yield length, ttype, val
         length += len(val)
Esempio n. 13
0
 def get_tokens_unprocessed(self, text):
     """Recreate tokens from tab-separated raw token dump lines.

     Each line has the form ``Token.Type`` + TAB + ``repr(value)``.
     Lines without a tab separator are yielded as Error tokens.
     Yields ``(offset, tokentype, value)`` tuples.
     """
     length = 0
     for match in line_re.finditer(text):
         try:
             ttypestr, val = match.group().split(b('\t'), 1)
         except ValueError:
             # No tab separator: emit the whole line as an Error token.
             val = match.group().decode(self.encoding)
             ttype = Error
         else:
             ttype = _ttype_cache.get(ttypestr)
             if not ttype:
                 # Resolve e.g. 'Token.Literal.String' by walking the
                 # attribute chain from the root Token, then cache it.
                 ttype = Token
                 ttypes = ttypestr.split('.')[1:]
                 for ttype_ in ttypes:
                     if not ttype_ or not ttype_[0].isupper():
                         raise ValueError('malformed token name')
                     ttype = getattr(ttype, ttype_)
                 _ttype_cache[ttypestr] = ttype
             # Drop the surrounding repr characters (two at each end:
             # presumably the u'/"' prefix and the closing quote plus
             # newline) and undo the escaping.
             val = val[2:-2].decode('unicode-escape')
         yield length, ttype, val
         length += len(val)
Esempio n. 14
0
    def format(self, tokensource, outfile):
        """Dump every token as a ``tokentype<TAB>repr(value)`` line.

        *outfile* must accept bytes (probed with an empty write).  The
        output is optionally gzip- or bz2-compressed according to
        ``self.compress``; error tokens are colorized when
        ``self.error_color`` is set.
        """
        # Probe the target: a text-mode file rejects a bytes write.
        try:
            outfile.write(b(''))
        except TypeError:
            raise TypeError('The raw tokens formatter needs a binary '
                            'output file')

        if self.compress == 'gz':
            import gzip
            outfile = gzip.GzipFile('', 'wb', 9, outfile)

            def emit(chunk):
                outfile.write(chunk.encode())

            finish = outfile.flush
        elif self.compress == 'bz2':
            import bz2
            compressor = bz2.BZ2Compressor(9)

            def emit(chunk):
                outfile.write(compressor.compress(chunk.encode()))

            def finish():
                # Emit whatever the compressor still buffers, then flush.
                outfile.write(compressor.flush())
                outfile.flush()
        else:
            def emit(chunk):
                outfile.write(chunk.encode())

            finish = outfile.flush

        if self.error_color:
            for ttype, value in tokensource:
                line = "%s\t%r\n" % (ttype, value)
                if ttype is Token.Error:
                    line = colorize(self.error_color, line)
                emit(line)
        else:
            for ttype, value in tokensource:
                emit("%s\t%r\n" % (ttype, value))
        finish()
Esempio n. 15
0
class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    # Lexer metadata (name, aliases, file patterns, MIME types).
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        # The entire input is one Text token starting at offset 0.
        yield 0, Text, text


# Cache of raw token-type strings already resolved to Token attributes
# (filled lazily by RawTokenLexer.get_tokens_unprocessed).
_ttype_cache = {}

# Matches a single line, including its trailing newline, in the raw
# byte stream.
line_re = re.compile(b('.*?\n'))

class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']
Esempio n. 16
0
class TextLexer(Lexer):
    """
    "Null" lexer, doesn't highlight anything.
    """
    # Lexer metadata (name, aliases, file patterns, MIME types).
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        # The entire input is one Text token starting at offset 0.
        yield 0, Text, text


# Cache of raw token-type strings already resolved to Token attributes
# (filled lazily by RawTokenLexer.get_tokens_unprocessed).
_ttype_cache = {}

# Matches a single line, including its trailing newline, in the raw
# byte stream.
line_re = re.compile(b('.*?\n'))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
Esempio n. 17
0
    """
    "Null" lexer, doesn't highlight anything.
    """

    name = "Text only"
    aliases = ["text"]
    filenames = ["*.txt"]
    mimetypes = ["text/plain"]

    def get_tokens_unprocessed(self, text):
        # The entire input is one Text token starting at offset 0.
        yield 0, Text, text


# Cache of raw token-type strings already resolved to Token attributes
# (filled lazily by RawTokenLexer.get_tokens_unprocessed).
_ttype_cache = {}

# Matches a single line, including its trailing newline, in the raw
# byte stream.
line_re = re.compile(b(".*?\n"))


class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.  This
    lexer raises exceptions during parsing if the token stream in the
    file is malformed.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """