def test_lexer_options():
    # test that the basic options work
    def ensure(tokens, output):
        concatenated = ''.join(token[1] for token in tokens)
        assert concatenated == output, \
            '%s: %r != %r' % (lexer, concatenated, output)

    def verify(cls):
        inst = cls(stripnl=False)
        ensure(inst.get_tokens('a\nb'), 'a\nb\n')
        ensure(inst.get_tokens('\n\n\n'), '\n\n\n')
        inst = cls(stripall=True)
        ensure(inst.get_tokens(' \n b\n\n\n'), 'b\n')
        # some lexers require full lines in input
        if ('ConsoleLexer' not in cls.__name__ and
                'SessionLexer' not in cls.__name__ and
                not cls.__name__.startswith('Literate') and
                cls.__name__ not in ('ErlangShellLexer', 'RobotFrameworkLexer')):
            inst = cls(ensurenl=False)
            ensure(inst.get_tokens('a\nb'), 'a\nb')
            inst = cls(ensurenl=False, stripall=True)
            ensure(inst.get_tokens('a\nb\n\n'), 'a\nb')

    for lexer in lexers._iter_lexerclasses(plugins=False):
        if lexer.__name__ == 'RawTokenLexer':
            # this one is special
            continue
        yield verify, lexer
def test_lexer_options():
    # test that the basic options work
    def ensure(tokens, output):
        concatenated = ''.join(token[1] for token in tokens)
        assert concatenated == output, \
            '%s: %r != %r' % (lexer, concatenated, output)

    def verify(cls):
        inst = cls(stripnl=False)
        ensure(inst.get_tokens('a\nb'), 'a\nb\n')
        ensure(inst.get_tokens('\n\n\n'), '\n\n\n')
        inst = cls(stripall=True)
        ensure(inst.get_tokens(' \n b\n\n\n'), 'b\n')
        # some lexers require full lines in input
        if cls.__name__ not in (
                'PythonConsoleLexer', 'RConsoleLexer', 'RubyConsoleLexer',
                'SqliteConsoleLexer', 'MatlabSessionLexer', 'ErlangShellLexer',
                'BashSessionLexer', 'LiterateHaskellLexer', 'LiterateAgdaLexer',
                'PostgresConsoleLexer', 'ElixirConsoleLexer', 'JuliaConsoleLexer',
                'RobotFrameworkLexer', 'DylanConsoleLexer', 'ShellSessionLexer',
                'LiterateIdrisLexer', 'LiterateCryptolLexer'):
            inst = cls(ensurenl=False)
            ensure(inst.get_tokens('a\nb'), 'a\nb')
            inst = cls(ensurenl=False, stripall=True)
            ensure(inst.get_tokens('a\nb\n\n'), 'a\nb')

    for lexer in lexers._iter_lexerclasses(plugins=False):
        if lexer.__name__ == 'RawTokenLexer':
            # this one is special
            continue
        yield verify, lexer
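# A minimal standalone sketch (not part of the original tests) of what the
# three options exercised above do, using PythonLexer as an arbitrary example;
# assumes Pygments is installed.
from pygments.lexers import PythonLexer

inst = PythonLexer(stripnl=False)    # keep leading/trailing blank lines
assert ''.join(t[1] for t in inst.get_tokens('\n\n\n')) == '\n\n\n'

inst = PythonLexer(stripall=True)    # strip all surrounding whitespace
assert ''.join(t[1] for t in inst.get_tokens(' \n b\n\n\n')) == 'b\n'

inst = PythonLexer(ensurenl=False)   # don't force a trailing newline
assert ''.join(t[1] for t in inst.get_tokens('a\nb')) == 'a\nb'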
def test_lexer_classes(self):
    a = self.assert_
    ae = self.assertEquals
    # test that every lexer class has the correct public API
    for lexer in lexers._iter_lexerclasses():
        a(type(lexer.name) is str)
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            a(hasattr(lexer, attr))
            a(type(getattr(lexer, attr)) is list,
              "%s: %s attribute wrong" % (lexer, attr))
        result = lexer.analyse_text("abc")
        a(isinstance(result, float) and 0.0 <= result <= 1.0)

        inst = lexer(opt1="val1", opt2="val2")
        if issubclass(lexer, RegexLexer):
            if not hasattr(lexer, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                a(lexer.token_variants)
                for variant in lexer.tokens:
                    a('root' in lexer.tokens[variant])
            else:
                a('root' in lexer._tokens, '%s has no root state' % lexer)

        tokens = list(inst.get_tokens(test_content))
        txt = ""
        for token in tokens:
            a(isinstance(token, tuple))
            a(isinstance(token[0], _TokenType))
            if isinstance(token[1], str):
                print repr(token[1])
            a(isinstance(token[1], unicode))
            txt += token[1]
        ae(txt, test_content, "%s lexer roundtrip failed: %r != %r" %
           (lexer.name, test_content, txt))
def test_lexer_options():
    # test that the basic options work
    def ensure(tokens, output):
        concatenated = "".join(token[1] for token in tokens)
        assert concatenated == output, "%s: %r != %r" % (lexer, concatenated, output)

    def verify(cls):
        inst = cls(stripnl=False)
        ensure(inst.get_tokens("a\nb"), "a\nb\n")
        ensure(inst.get_tokens("\n\n\n"), "\n\n\n")
        inst = cls(stripall=True)
        ensure(inst.get_tokens(" \n b\n\n\n"), "b\n")
        # some lexers require full lines in input
        if cls.__name__ not in (
            "PythonConsoleLexer",
            "RConsoleLexer",
            "RubyConsoleLexer",
            "SqliteConsoleLexer",
            "MatlabSessionLexer",
            "ErlangShellLexer",
            "BashSessionLexer",
            "LiterateHaskellLexer",
        ):
            inst = cls(ensurenl=False)
            ensure(inst.get_tokens("a\nb"), "a\nb")
            inst = cls(ensurenl=False, stripall=True)
            ensure(inst.get_tokens("a\nb\n\n"), "a\nb")

    for lexer in lexers._iter_lexerclasses():
        if lexer.__name__ == "RawTokenLexer":
            # this one is special
            continue
        yield verify, lexer
def guess_lexer(_text, conf_threshold=0.01, mime_mul=0.09, **options):
    """Guess a lexer by strong distinctions in the text (e.g., a shebang)."""
    # try to get a vim modeline first
    ft = get_filetype_from_buffer(_text)

    if ft is not None:
        try:
            return get_lexer_by_name(ft, **options)
        except ClassNotFound:
            pass

    best_lexer = [0.0, None]
    for lexer in _iter_lexerclasses():
        rv = lexer.analyse_text(_text)
        # MIME has an unusually high rv when given any kind of text
        if 'MIME' in str(lexer):
            rv *= mime_mul
        if rv == 1.0:
            return lexer(**options)
        if rv > best_lexer[0]:
            best_lexer[:] = (rv, lexer)
    if not best_lexer[0] or best_lexer[0] < conf_threshold or best_lexer[1] is None:
        raise ClassNotFound('no lexer matching the text found')
    return best_lexer[1](**options)
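# Hypothetical call to the variant above: the shebang should give the Python
# lexer a decisive analyse_text() score, so no fallback is needed. The
# threshold arguments shown are just the function's own defaults.
code = '#!/usr/bin/env python\nprint("hello")\n'
lexer = guess_lexer(code, conf_threshold=0.01, mime_mul=0.09)
print(lexer.__class__.__name__)   # expected: a Python lexer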
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, \
                "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0
        result = cls.analyse_text(".abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        assert all(al.lower() == al for al in cls.aliases)

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert 'root' in cls.tokens[variant]
            else:
                assert 'root' in cls._tokens, \
                    '%s has no root state' % cls

        if cls.name in ['XQuery', 'Opa']:   # XXX temporary
            return

        try:
            tokens = list(inst.get_tokens(test_content))
        except KeyboardInterrupt:
            raise KeyboardInterrupt(
                'interrupted %s.get_tokens(): test_content=%r' %
                (cls.__name__, test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print(repr(token[1]))
            assert isinstance(token[1], text_type)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % \
            (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses():
        yield verify, lexer
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, \
                "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0
        result = cls.analyse_text(".abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        assert all(al.lower() == al for al in cls.aliases)

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert 'root' in cls.tokens[variant]
            else:
                assert 'root' in cls._tokens, \
                    '%s has no root state' % cls

        if cls.name in ['XQuery', 'Opa']:   # XXX temporary
            return

        try:
            tokens = list(inst.get_tokens(test_content))
        except KeyboardInterrupt:
            raise KeyboardInterrupt('interrupted %s.get_tokens(): test_content=%r' %
                                    (cls.__name__, test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print(repr(token[1]))
            assert isinstance(token[1], text_type)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % \
            (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses(plugins=False):
        yield verify, lexer
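# Standalone sketch of the round-trip property the verify() functions above
# assert for every lexer: concatenating all token values must reproduce the
# input exactly. PythonLexer is used as an arbitrary, assumed example.
from pygments.lexers import PythonLexer

content = 'print(1 + 2)\n'
tokens = list(PythonLexer().get_tokens(content))
assert ''.join(value for _, value in tokens) == content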
def guess_lexer_for_filename(_fn, _text, **options):
    """
    Ripped from the tip of Pygments: a version of guess_lexer_for_filename
    that supports both Python 2 and 3. The 1.6 release has a Python 3 bug
    that this resolves.
    """
    # TODO: remove this once Pygments ships a new release.
    fn = basename(_fn)
    primary = None
    matching_lexers = set()
    for lexer in _iter_lexerclasses():
        for filename in lexer.filenames:
            if fnmatch.fnmatch(fn, filename):
                matching_lexers.add(lexer)
                primary = lexer
        for filename in lexer.alias_filenames:
            if fnmatch.fnmatch(fn, filename):
                matching_lexers.add(lexer)
    if not matching_lexers:
        raise ClassNotFound('no lexer for filename %r found' % fn)
    if len(matching_lexers) == 1:
        return matching_lexers.pop()(**options)
    result = []
    for lexer in matching_lexers:
        rv = lexer.analyse_text(_text)
        if rv == 1.0:
            return lexer(**options)
        result.append((rv, lexer))

    # since py3 can no longer sort by class name by default, here is the
    # sorting function that works in both
    def type_sort(type_):
        return (type_[0], type_[1].__name__)
    result.sort(key=type_sort)

    if not result[-1][0] and primary is not None:
        return primary(**options)
    return result[-1][1](**options)
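# Hypothetical usage of the backport above: the filename narrows the
# candidate set, then analyse_text() on the content breaks any ties.
lexer = guess_lexer_for_filename('example.py', 'import os\n')
print(lexer.__class__.__name__)   # expected: PythonLexer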
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in "aliases", "filenames", "alias_filenames", "mimetypes":
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0
        result = cls.analyse_text(".abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, "_tokens"):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert "root" in cls.tokens[variant]
            else:
                assert "root" in cls._tokens, "%s has no root state" % cls

        if cls.name in ["XQuery", "Opa"]:  # XXX temporary
            return

        tokens = list(inst.get_tokens(test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print repr(token[1])
            assert isinstance(token[1], unicode)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses():
        yield verify, lexer
def find_best_lexer(text, min_confidence=0.85):
    """
    Like the built-in Pygments guess_lexer, except with a minimum confidence
    level. If it is not met, fall back to plain text to avoid bad
    highlighting.

    :returns: Lexer instance
    """
    current_best_confidence = 0.0
    current_best_lexer = None
    for lexer in _iter_lexerclasses():
        confidence = lexer.analyse_text(text)
        if confidence == 1.0:
            return lexer()
        elif confidence > current_best_confidence:
            current_best_confidence = confidence
            current_best_lexer = lexer
    if current_best_confidence >= min_confidence:
        return current_best_lexer()
    else:
        return TextLexer()
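# Hedged usage sketch: an unambiguous shebang scores 1.0 and returns that
# lexer immediately; gibberish stays under min_confidence and falls back to
# the plain-text lexer rather than risking bad highlighting.
print(find_best_lexer('#!/bin/bash\necho hi\n'))   # e.g. a Bash lexer
print(find_best_lexer('zzz qqq 123'))              # e.g. TextLexer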
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, \
                "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert 'root' in cls.tokens[variant]
            else:
                assert 'root' in cls._tokens, \
                    '%s has no root state' % cls

        tokens = list(inst.get_tokens(test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print repr(token[1])
            assert isinstance(token[1], unicode)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % \
            (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses():
        yield verify, lexer
import random
from os import path

import pytest

from pygments import lexers
from pygments.lexer import RegexLexer

TESTDIR = path.dirname(path.abspath(__file__))
TESTFILE = path.join(TESTDIR, 'test_basic_api.py')

test_content = [chr(i) for i in range(33, 128)] * 5
random.shuffle(test_content)
test_content = ''.join(test_content) + '\n'


@pytest.mark.parametrize('name', lexers.LEXERS)
def test_lexer_instantiate_all(name):
    # instantiate every lexer, to see if the token type defs are correct
    getattr(lexers, name)


@pytest.mark.parametrize('cls', lexers._iter_lexerclasses(plugins=False))
def test_lexer_classes(cls):
    # test that every lexer class has the correct public API
    assert type(cls.name) is str
    for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
        assert hasattr(cls, attr)
        assert type(getattr(cls, attr)) is list, \
            "%s: %s attribute wrong" % (cls, attr)
    result = cls.analyse_text("abc")
    assert isinstance(result, float) and 0.0 <= result <= 1.0
    result = cls.analyse_text(".abc")
    assert isinstance(result, float) and 0.0 <= result <= 1.0

    assert all(al.lower() == al for al in cls.aliases)

    if issubclass(cls, RegexLexer):
        if not hasattr(cls, '_tokens'):
            # if there's no "_tokens", the lexer has to be one with
            # multiple tokendef variants
            assert cls.token_variants
            for variant in cls.tokens:
                assert 'root' in cls.tokens[variant]
        else:
            assert 'root' in cls._tokens, \
                '%s has no root state' % cls
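# The parametrized version above replaces the nose-style `yield verify, lexer`
# generators earlier in this listing: pytest collects one test item per lexer
# class, so a failure reports the offending class by name. To run just these
# checks (assuming a standard Pygments checkout layout):
#   $ pytest tests/test_basic_api.py -k test_lexer_classes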
def find_lexers(data):
    for l in lexers._iter_lexerclasses():
        print l
        print highlight(data, l(), formatters.TerminalFormatter())
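# Hypothetical usage: render the same snippet with every registered lexer to
# eyeball terminal output; noisy, but occasionally handy when comparing how
# different lexers tokenize a piece of text.
# find_lexers('print("hello")\n')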