def test_lexer_options():
    # test that the basic options work
    def ensure(tokens, output):
        concatenated = ''.join(token[1] for token in tokens)
        assert concatenated == output, \
            '%s: %r != %r' % (lexer, concatenated, output)

    def verify(cls):
        inst = cls(stripnl=False)
        ensure(inst.get_tokens('a\nb'), 'a\nb\n')
        ensure(inst.get_tokens('\n\n\n'), '\n\n\n')
        inst = cls(stripall=True)
        ensure(inst.get_tokens('   \n  b\n\n\n'), 'b\n')
        # some lexers require full lines in input
        if ('ConsoleLexer' not in cls.__name__ and
                'SessionLexer' not in cls.__name__ and
                not cls.__name__.startswith('Literate') and
                cls.__name__ not in ('ErlangShellLexer', 'RobotFrameworkLexer')):
            inst = cls(ensurenl=False)
            ensure(inst.get_tokens('a\nb'), 'a\nb')
            inst = cls(ensurenl=False, stripall=True)
            ensure(inst.get_tokens('a\nb\n\n'), 'a\nb')

    for lexer in lexers._iter_lexerclasses(plugins=False):
        if lexer.__name__ == 'RawTokenLexer':
            # this one is special
            continue
        yield verify, lexer
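For reference, a minimal self-contained sketch of what the options exercised above actually do, using Pygments' TextLexer as an arbitrary concrete lexer (my choice of lexer, not part of the test):

from pygments.lexers import TextLexer

# stripnl (default True) strips leading and trailing newlines from the input;
# stripall additionally strips all leading and trailing whitespace;
# ensurenl (default True) appends a final newline if the input lacks one.
assert ''.join(v for _, v in TextLexer(stripnl=False).get_tokens('a\nb')) == 'a\nb\n'
assert ''.join(v for _, v in TextLexer(stripall=True).get_tokens('   \n  b\n\n\n')) == 'b\n'
assert ''.join(v for _, v in TextLexer(ensurenl=False).get_tokens('a\nb')) == 'a\nb'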
def test_lexer_options():
    # test that the basic options work
    def ensure(tokens, output):
        concatenated = ''.join(token[1] for token in tokens)
        assert concatenated == output, \
            '%s: %r != %r' % (lexer, concatenated, output)

    def verify(cls):
        inst = cls(stripnl=False)
        ensure(inst.get_tokens('a\nb'), 'a\nb\n')
        ensure(inst.get_tokens('\n\n\n'), '\n\n\n')
        inst = cls(stripall=True)
        ensure(inst.get_tokens('   \n  b\n\n\n'), 'b\n')
        # some lexers require full lines in input
        if cls.__name__ not in ('PythonConsoleLexer', 'RConsoleLexer',
                                'RubyConsoleLexer', 'SqliteConsoleLexer',
                                'MatlabSessionLexer', 'ErlangShellLexer',
                                'BashSessionLexer', 'LiterateHaskellLexer',
                                'LiterateAgdaLexer', 'PostgresConsoleLexer',
                                'ElixirConsoleLexer', 'JuliaConsoleLexer',
                                'RobotFrameworkLexer', 'DylanConsoleLexer',
                                'ShellSessionLexer', 'LiterateIdrisLexer',
                                'LiterateCryptolLexer'):
            inst = cls(ensurenl=False)
            ensure(inst.get_tokens('a\nb'), 'a\nb')
            inst = cls(ensurenl=False, stripall=True)
            ensure(inst.get_tokens('a\nb\n\n'), 'a\nb')

    for lexer in lexers._iter_lexerclasses(plugins=False):
        if lexer.__name__ == 'RawTokenLexer':
            # this one is special
            continue
        yield verify, lexer
Example #4
    def test_lexer_classes(self):
        a = self.assertTrue
        ae = self.assertEqual
        # test that every lexer class has the correct public API
        for lexer in lexers._iter_lexerclasses():
            a(type(lexer.name) is str)
            for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
                a(hasattr(lexer, attr))
                a(type(getattr(lexer, attr)) is list, "%s: %s attribute wrong" %
                                                      (lexer, attr))
            result = lexer.analyse_text("abc")
            a(isinstance(result, float) and 0.0 <= result <= 1.0)

            inst = lexer(opt1="val1", opt2="val2")
            if issubclass(lexer, RegexLexer):
                if not hasattr(lexer, '_tokens'):
                    # if there's no "_tokens", the lexer has to be one with
                    # multiple tokendef variants
                    a(lexer.token_variants)
                    for variant in lexer.tokens:
                        a('root' in lexer.tokens[variant])
                else:
                    a('root' in lexer._tokens, '%s has no root state' % lexer)

            tokens = list(inst.get_tokens(test_content))
            txt = ""
            for token in tokens:
                a(isinstance(token, tuple))
                a(isinstance(token[0], _TokenType))
                if isinstance(token[1], str):
                    print(repr(token[1]))
                a(isinstance(token[1], unicode))
                txt += token[1]
            ae(txt, test_content, "%s lexer roundtrip failed: %r != %r" %
                    (lexer.name, test_content, txt))
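The same public API can be inspected directly on any single lexer class; a quick sketch (PythonLexer is simply a convenient example, not one singled out by the test):

from pygments.lexers import PythonLexer

print(PythonLexer.name)        # 'Python'
print(PythonLexer.aliases)     # lowercase alias strings, e.g. ['python', 'py', ...]
print(PythonLexer.filenames)   # filename globs, e.g. ['*.py', ...]
print(PythonLexer.mimetypes)   # e.g. ['text/x-python', ...]
print(PythonLexer.analyse_text('#!/usr/bin/env python\n'))  # float between 0.0 and 1.0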
Example #6
def test_lexer_options():
    # test that the basic options work
    def ensure(tokens, output):
        concatenated = "".join(token[1] for token in tokens)
        assert concatenated == output, "%s: %r != %r" % (lexer, concatenated, output)

    def verify(cls):
        inst = cls(stripnl=False)
        ensure(inst.get_tokens("a\nb"), "a\nb\n")
        ensure(inst.get_tokens("\n\n\n"), "\n\n\n")
        inst = cls(stripall=True)
        ensure(inst.get_tokens("   \n  b\n\n\n"), "b\n")
        # some lexers require full lines in input
        if cls.__name__ not in (
            "PythonConsoleLexer",
            "RConsoleLexer",
            "RubyConsoleLexer",
            "SqliteConsoleLexer",
            "MatlabSessionLexer",
            "ErlangShellLexer",
            "BashSessionLexer",
            "LiterateHaskellLexer",
        ):
            inst = cls(ensurenl=False)
            ensure(inst.get_tokens("a\nb"), "a\nb")
            inst = cls(ensurenl=False, stripall=True)
            ensure(inst.get_tokens("a\nb\n\n"), "a\nb")

    for lexer in lexers._iter_lexerclasses():
        if lexer.__name__ == "RawTokenLexer":
            # this one is special
            continue
        yield verify, lexer
Example #7
def guess_lexer(_text, conf_threshold=0.01, mime_mul=0.09, **options):
    """Guess a lexer by strong distinctions in the text (eg, shebang)."""

    # try to get a vim modeline first
    ft = get_filetype_from_buffer(_text)

    if ft is not None:
        try:
            return get_lexer_by_name(ft, **options)
        except ClassNotFound:
            pass

    best_lexer = [0.0, None]
    for lexer in _iter_lexerclasses():
        rv = lexer.analyse_text(_text)

        # MIME has an unusually high rv when given any kind of text
        if 'MIME' in str(lexer):
            rv *= mime_mul

        if rv == 1.0:
            return lexer(**options)
        if rv > best_lexer[0]:
            best_lexer[:] = (rv, lexer)
    if (not best_lexer[0] or best_lexer[0] < conf_threshold
            or best_lexer[1] is None):
        raise ClassNotFound('no lexer matching the text found')
    return best_lexer[1](**options)
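A hypothetical call to the helper above; a shebang is exactly the kind of strong distinction it keys on (the class name in the comment is an expectation, not verified output):

lexer = guess_lexer('#!/usr/bin/env python\nprint("hello")\n')
print(type(lexer).__name__)   # expected: 'PythonLexer'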
Example #8
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, \
                   "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0
        result = cls.analyse_text(".abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        assert all(al.lower() == al for al in cls.aliases)

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert 'root' in cls.tokens[variant]
            else:
                assert 'root' in cls._tokens, \
                       '%s has no root state' % cls

        if cls.name in ['XQuery', 'Opa']:  # XXX temporary
            return

        try:
            tokens = list(inst.get_tokens(test_content))
        except KeyboardInterrupt:
            raise KeyboardInterrupt(
                'interrupted %s.get_tokens(): test_content=%r' %
                (cls.__name__, test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print(repr(token[1]))
            assert isinstance(token[1], text_type)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % \
               (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses():
        yield verify, lexer
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, \
                "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0
        result = cls.analyse_text(".abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        assert all(al.lower() == al for al in cls.aliases)

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert 'root' in cls.tokens[variant]
            else:
                assert 'root' in cls._tokens, \
                       '%s has no root state' % cls

        if cls.name in ['XQuery', 'Opa']:   # XXX temporary
            return

        try:
            tokens = list(inst.get_tokens(test_content))
        except KeyboardInterrupt:
            raise KeyboardInterrupt(
                'interrupted %s.get_tokens(): test_content=%r' %
                (cls.__name__, test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print(repr(token[1]))
            assert isinstance(token[1], text_type)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % \
            (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses(plugins=False):
        yield verify, lexer
Example #10
def guess_lexer_for_filename(_fn, _text, **options):
    """
    Ripped from the tip of pygments
    It is a version of this that supports python 2 and 3.
    The 1.6 version has a python3 bug this resolves
    """
    # todo - When pygments releases a new version this should be removed.
    fn = basename(_fn)
    primary = None
    matching_lexers = set()
    for lexer in _iter_lexerclasses():
        for filename in lexer.filenames:
            if fnmatch.fnmatch(fn, filename):
                matching_lexers.add(lexer)
                primary = lexer
        for filename in lexer.alias_filenames:
            if fnmatch.fnmatch(fn, filename):
                matching_lexers.add(lexer)
    if not matching_lexers:
        raise ClassNotFound('no lexer for filename %r found' % fn)
    if len(matching_lexers) == 1:
        return matching_lexers.pop()(**options)
    result = []
    for lexer in matching_lexers:
        rv = lexer.analyse_text(_text)
        if rv == 1.0:
            return lexer(**options)
        result.append((rv, lexer))

    # Python 3 can no longer sort classes by default, so sort on
    # (score, class name), which works on both versions.
    def type_sort(type_):
        return (type_[0], type_[1].__name__)

    result.sort(key=type_sort)

    if not result[-1][0] and primary is not None:
        return primary(**options)
    return result[-1][1](**options)
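A hypothetical usage sketch for the backport above; the filename narrows the candidates and analyse_text breaks any remaining tie (the expected class is an assumption):

lexer = guess_lexer_for_filename('example.py', 'import os\nprint(os.name)\n')
print(type(lexer).__name__)   # expected: 'PythonLexer'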
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in "aliases", "filenames", "alias_filenames", "mimetypes":
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0
        result = cls.analyse_text(".abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, "_tokens"):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert "root" in cls.tokens[variant]
            else:
                assert "root" in cls._tokens, "%s has no root state" % cls

        if cls.name in ["XQuery", "Opa"]:  # XXX temporary
            return

        tokens = list(inst.get_tokens(test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print(repr(token[1]))
            assert isinstance(token[1], unicode)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses():
        yield verify, lexer
Example #12
def find_best_lexer(text, min_confidence=0.85):
    """
    Like the built-in Pygments guess_lexer, except it enforces a minimum
    confidence level. If that is not met, it falls back to plain text to
    avoid bad highlighting.

    :returns: Lexer instance
    """
    current_best_confidence = 0.0
    current_best_lexer = None
    for lexer in _iter_lexerclasses():
        confidence = lexer.analyse_text(text)
        if confidence == 1.0:
            return lexer()
        elif confidence > current_best_confidence:
            current_best_confidence = confidence
            current_best_lexer = lexer

    if current_best_confidence >= min_confidence:
        return current_best_lexer()
    else:
        return TextLexer()
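A hypothetical usage sketch; anything below the confidence threshold degrades gracefully to plain text (the concrete classes in the comments are expectations, not verified output):

print(type(find_best_lexer('#!/bin/bash\necho hi\n')).__name__)          # e.g. 'BashLexer'
print(type(find_best_lexer('nothing recognisably code-like')).__name__)  # 'TextLexer' fallback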
Example #14
def test_lexer_classes():
    # test that every lexer class has the correct public API
    def verify(cls):
        assert type(cls.name) is str
        for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
            assert hasattr(cls, attr)
            assert type(getattr(cls, attr)) is list, \
                   "%s: %s attribute wrong" % (cls, attr)
        result = cls.analyse_text("abc")
        assert isinstance(result, float) and 0.0 <= result <= 1.0

        inst = cls(opt1="val1", opt2="val2")
        if issubclass(cls, RegexLexer):
            if not hasattr(cls, '_tokens'):
                # if there's no "_tokens", the lexer has to be one with
                # multiple tokendef variants
                assert cls.token_variants
                for variant in cls.tokens:
                    assert 'root' in cls.tokens[variant]
            else:
                assert 'root' in cls._tokens, \
                       '%s has no root state' % cls

        tokens = list(inst.get_tokens(test_content))
        txt = ""
        for token in tokens:
            assert isinstance(token, tuple)
            assert isinstance(token[0], _TokenType)
            if isinstance(token[1], str):
                print(repr(token[1]))
            assert isinstance(token[1], unicode)
            txt += token[1]
        assert txt == test_content, "%s lexer roundtrip failed: %r != %r" % \
               (cls.name, test_content, txt)

    for lexer in lexers._iter_lexerclasses():
        yield verify, lexer
Example #15
TESTDIR = path.dirname(path.abspath(__file__))
TESTFILE = path.join(TESTDIR, 'test_basic_api.py')

test_content = [chr(i) for i in range(33, 128)] * 5
random.shuffle(test_content)
test_content = ''.join(test_content) + '\n'


@pytest.mark.parametrize('name', lexers.LEXERS)
def test_lexer_instantiate_all(name):
    # instantiate every lexer, to see if the token type defs are correct
    getattr(lexers, name)


@pytest.mark.parametrize('cls', lexers._iter_lexerclasses(plugins=False))
def test_lexer_classes(cls):
    # test that every lexer class has the correct public API
    assert type(cls.name) is str
    for attr in 'aliases', 'filenames', 'alias_filenames', 'mimetypes':
        assert hasattr(cls, attr)
        assert type(getattr(cls, attr)) is list, \
            "%s: %s attribute wrong" % (cls, attr)
    result = cls.analyse_text("abc")
    assert isinstance(result, float) and 0.0 <= result <= 1.0
    result = cls.analyse_text(".abc")
    assert isinstance(result, float) and 0.0 <= result <= 1.0

    assert all(al.lower() == al for al in cls.aliases)

    if issubclass(cls, RegexLexer):
        if not hasattr(cls, '_tokens'):
            # if there's no "_tokens", the lexer has to be one with
            # multiple tokendef variants
            assert cls.token_variants
            for variant in cls.tokens:
                assert 'root' in cls.tokens[variant]
        else:
            assert 'root' in cls._tokens, \
                '%s has no root state' % cls
Example #16
def find_lexers(data):
    for l in lexers._iter_lexerclasses():
        print(l)
        print(highlight(data, l(), formatters.TerminalFormatter()))
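The snippet above assumes the usual Pygments imports; a hedged sketch of how it might be driven:

from pygments import formatters, highlight, lexers

find_lexers('print("hello world")\n')   # renders the sample through every available lexer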