Example #1
    def createLexers(self):

        lex = {}
        lex['.c'] = CFamilyLexer()
        lex['.h'] = CFamilyLexer()
        lex['.cpp'] = CppLexer()
        lex['.hpp'] = CppLexer()
        lex['.css'] = CssLexer()
        lex['.sass'] = SassLexer()
        lex['.yaml'] = YamlLexer()
        lex['.yml'] = YamlLexer()
        lex['.json'] = JsonLexer()
        lex['.cs'] = CSharpLexer()
        lex['.fs'] = FSharpLexer()
        lex['.e'] = EiffelLexer()
        lex['.erl'] = ErlangLexer()
        lex['.hrl'] = ErlangLexer()
        lex['.es'] = ErlangLexer()
        lex['.f03'] = FortranLexer()
        lex['.f90'] = FortranLexer()
        lex['.F03'] = FortranLexer()
        lex['.F90'] = FortranLexer()
        lex['.go'] = GoLexer()
        lex['.hs'] = HaskellLexer()
        lex['.v'] = VerilogLexer()
        lex['.vhdl'] = VhdlLexer()
        lex['.vhd'] = VhdlLexer()
        lex['.html'] = HtmlLexer()
        lex['.htm'] = HtmlLexer()
        lex['.xhtml'] = HtmlLexer()
        lex['.xml'] = XmlLexer()
        lex['.js'] = JavascriptLexer()
        lex['.ts'] = TypeScriptLexer()
        lex['.coffee'] = CoffeeScriptLexer()
        lex['.java'] = JavaLexer()
        lex['.scala'] = ScalaLexer()
        lex['.kt'] = KotlinLexer()
        lex['.ktm'] = KotlinLexer()
        lex['.kts'] = KotlinLexer()
        lex['.lisp'] = CommonLispLexer()
        lex['make'] = MakefileLexer()
        lex['Make'] = MakefileLexer()
        lex['CMake'] = CMakeLexer()
        lex['cmake'] = CMakeLexer()
        lex['.m'] = MatlabLexer()
        lex['.mat'] = MatlabLexer()
        lex['.dpr'] = DelphiLexer()
        lex['.perl'] = PerlLexer()
        lex['.php'] = PhpLexer()
        lex['.pr'] = PrologLexer()
        lex['.py'] = Python3Lexer()
        lex['.rb'] = RubyLexer()
        lex['.sh'] = BashLexer()
        lex['.sql'] = MySqlLexer()
        lex['.mysql'] = MySqlLexer()
        lex['.tcl'] = TclLexer()
        lex['.awk'] = AwkLexer()

        return lex
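
A minimal sketch of how the returned map might be consumed, assuming `editor` is an instance of the surrounding class; `highlight_source` and the file path are hypothetical and not part of the original snippet:

import os
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers.special import TextLexer

def highlight_source(path, lexers):
    # Hypothetical helper: pick a lexer by file extension, fall back to plain text.
    _, ext = os.path.splitext(path)
    lexer = lexers.get(ext, TextLexer())
    with open(path) as fin:
        return highlight(fin.read(), lexer, HtmlFormatter())

html = highlight_source('Main.hs', editor.createLexers())  # path is illustrative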
Example #2
from pygments import highlight
from pygments.formatters import Terminal256Formatter
from pygments.lexers.haskell import HaskellLexer
from pygments.styles import get_style_by_name


def hl(s, config, style_name=None):
    if style_name:
        style = get_style_by_name(style_name)
    else:
        style = get_style_by_name(config.style)
    # Don't try to highlight if the string already has escape sequences
    if '\033[' in s:
        return s
    else:
        # Sometimes highlight adds an extra newline, so we remove it
        return highlight(s, HaskellLexer(),
                         Terminal256Formatter(style=style)).strip()
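
A hedged usage sketch; `config` is a stand-in with a `style` attribute, which is an assumption about the caller's configuration object:

from types import SimpleNamespace

config = SimpleNamespace(style='monokai')  # assumed shape of the config object
print(hl('main :: IO ()\nmain = putStrLn "hi"', config))
print(hl('main = return ()', config, style_name='friendly'))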
Example #3
class PygmentsParser:
    LEXERS = {
        "Scala": ScalaLexer(),
        "Swift": SwiftLexer(),
        "Kotlin": KotlinLexer(),
        "Haskell": HaskellLexer()
    }

    TYPES = {
        "Scala": {pygments.token.Name, pygments.token.Keyword.Type},
        "Swift": {pygments.token.Name},
        "Kotlin": {pygments.token.Name},
        "Haskell": {pygments.token.Name, pygments.token.Keyword.Type}
    }

    @staticmethod
    def read_file(file: str) -> str:
        """
        Read the contents of the file.
        :param file: the path to the file.
        :return: the contents of the file.
        """
        with open(file) as fin:
            return fin.read()

    @staticmethod
    def get_tokens(file: str, lang: str) -> Counter:
        """
        Gather a Counter object of tokens in the file and their count.
        :param file: the path to the file.
        :param lang: the language of the file.
        :return: a Counter object of items: token and count.
        """
        content = PygmentsParser.read_file(file)
        tokens = []
        for pair in pygments.lex(content, PygmentsParser.LEXERS[lang]):
            if any(pair[0] in sublist
                   for sublist in PygmentsParser.TYPES[lang]):
                tokens.extend(list(Subtokenizer.process_token(pair[1])))
        return Counter(tokens)
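
A short usage sketch, assuming a Haskell source file at an illustrative path:

counts = PygmentsParser.get_tokens('src/Main.hs', 'Haskell')  # path is illustrative
for subtoken, n in counts.most_common(10):
    print(subtoken, n)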
Example #4
class PygmentsParser:
    # Pygments lexers corresponding to a given language.
    LEXERS = {"Scala": ScalaLexer(),
              "Swift": SwiftLexer(),
              "Kotlin": KotlinLexer(),
              "Haskell": HaskellLexer()}
    # Pygments token types corresponding to identifiers in a given language.
    IDENTIFIERS = {"Scala": {pygments.token.Name, pygments.token.Keyword.Type},
                   "Swift": {pygments.token.Name},
                   "Kotlin": {pygments.token.Name},
                   "Haskell": {pygments.token.Name, pygments.token.Keyword.Type}}

    @staticmethod
    def get_identifiers_sequence_from_code(code: str, lang: str, identifiers_verbose: bool = False,
                                           subtokenize: bool = False) -> \
            Union[List[str], List[IdentifierData]]:
        """
        Given the code and its language, gather its identifiers.
        :param code: the code to parse.
        :param lang: the language of the code fragment.
        :param identifiers_verbose: if True, will save not only identifiers themselves,
                                    but also their parameters as IdentifierData.
        :param subtokenize: if True, will split the tokens into subtokens.
        :return: list of identifiers as either strings or IdentifierData objects.
        """
        tokens = []
        for pair in pygments.lex(code, PygmentsParser.LEXERS[lang]):
            if any(pair[0] in sublist for sublist in PygmentsParser.IDENTIFIERS[lang]):
                # TODO: implement indexes for tokens, it's possible in pygments. (0, 0, 0) for now.
                if not identifiers_verbose:
                    token = pair[1]
                else:
                    token = IdentifierData(pair[1], 0, 0, 0)
                if not subtokenize:
                    tokens.append(token)
                else:
                    tokens.extend(subtokenize_identifier(token))
        return tokens

    @staticmethod
    def get_data_from_file(file: str, lang: str, identifiers_verbose: bool = False,
                           subtokenize: bool = False) -> FileData:
        """
        Given a file and its language, return a FileData object.
        :param file: path to file.
        :param lang: the language of code.
        :param identifiers_verbose: if True, will save not only identifiers themselves,
                                    but also their parameters as IdentifierData.
        :param subtokenize: if True, will split the tokens into subtokens.
        :return: FileData object.
        """
        code = read_file(file)
        identifiers = PygmentsParser.get_identifiers_sequence_from_code(code, lang,
                                                                        identifiers_verbose,
                                                                        subtokenize)
        if identifiers_verbose:
            identifiers_type = IdentifiersTypes.VERBOSE
        else:
            identifiers_type = IdentifiersTypes.STRING
        # The "objects" are always empty, because Pygments don't support recognizing them.
        return FileData(path=file, lang=lang, objects=[], identifiers=identifiers,
                        identifiers_type=identifiers_type)
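
A hedged sketch of the verbose mode; the file path is illustrative:

data = PygmentsParser.get_data_from_file('app/Main.kt', 'Kotlin',
                                          identifiers_verbose=True,
                                          subtokenize=True)
print(data.lang, len(data.identifiers), data.identifiers_type)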
Example #5
    tokens = {
        'root': [
            (r'^(.*)(-- .*)$', bygroups(token.Text, token.Comment)),
            (r'{-(?:.|\n)*?-}', token.Comment),
            (r"'[^']+'", token.String.Char),
            (r'"[^"]+"', token.String.Char),
            # / is a word breaker; this keeps Prelude URLs from being highlighted
            (r'(http[:a-zA-Z/\.-]+)', token.Text),
            (r'(\'\'(?:.|\n)*?\'\')', token.String.Char),
            (r'\b(\+\d+|-\d+|\d+)', token.Number.Integer),
            (r'\b(None|Some|Bool|Natural|Integer|Double|Text|Type|List|Optional)\b',
             token.Keyword.Type),
            (r'\b(%s)\b' % DhallKeywords, token.Keyword),
            (r'(%s)' % DhallWords, token.Operator.Word),
            (r'\b(True|False)\b', token.Name.Builtin.Pseudo),
            (r'-- .*$', token.Comment),
            (r',', token.Punctuation),
            (r'.', token.Text),
        ]
    }


lexers['dhall'] = DhallLexer(startinline=True)

# Aliases for compatibility with GitHub syntax highlighter names
from pygments.lexers.shell import BashLexer
from pygments.lexers.haskell import HaskellLexer
lexers['bash'] = BashLexer(startinline=True)
lexers['haskell'] = HaskellLexer(startinline=True)
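
Assuming the surrounding module exposes this `lexers` dictionary, a minimal highlighting call might look like the following (the Dhall snippet is illustrative):

from pygments import highlight
from pygments.formatters import TerminalFormatter

src = 'let greeting = "Hello" in greeting'
print(highlight(src, lexers['dhall'], TerminalFormatter()))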
Example #6
    def create_lexer(self):
        from pygments.filters import TokenMergeFilter
        from pygments.lexers.haskell import HaskellLexer
        l = HaskellLexer()
        l.add_filter(TokenMergeFilter())
        return l
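
A self-contained sketch of what the merging filter does: consecutive tokens of the same type are collapsed into one before they reach the formatter. The Haskell one-liner is illustrative.

from pygments.filters import TokenMergeFilter
from pygments.lexers.haskell import HaskellLexer

lexer = HaskellLexer()
lexer.add_filter(TokenMergeFilter())
for ttype, value in lexer.get_tokens('add x y = x + y\n'):
    print(ttype, repr(value))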