def createLexers(self):
    """Build the mapping from file extension (or special file name) to a
    Pygments lexer instance.

    Bug fix: the TypeScript lexer was previously registered under '.tex'
    (the LaTeX extension); TypeScript files use '.ts'.

    :return: dict mapping extension/name strings to freshly-created lexers.
    """
    # Table of (lexer class, extensions/names it handles).  Each key still
    # gets its own lexer instance, matching the original behavior.
    table = (
        (CFamilyLexer, ('.c', '.h')),
        (CppLexer, ('.cpp', '.hpp')),
        (CssLexer, ('.css',)),
        (SassLexer, ('.sass',)),
        (YamlLexer, ('.yaml', '.yml')),
        (JsonLexer, ('.json',)),
        (CSharpLexer, ('.cs',)),
        (FSharpLexer, ('.fs',)),
        (EiffelLexer, ('.e',)),
        (ErlangLexer, ('.erl', '.hrl', '.es')),
        (FortranLexer, ('.f03', '.f90', '.F03', '.F90')),
        (GoLexer, ('.go',)),
        (HaskellLexer, ('.hs',)),
        (VerilogLexer, ('.v',)),
        (VhdlLexer, ('.vhdl', '.vhd')),
        (HtmlLexer, ('.html', '.htm', '.xhtml')),
        (XmlLexer, ('.xml',)),
        (JavascriptLexer, ('.js',)),
        (TypeScriptLexer, ('.ts',)),  # was '.tex', which is LaTeX, not TypeScript
        (CoffeeScriptLexer, ('.coffee',)),
        (JavaLexer, ('.java',)),
        (ScalaLexer, ('.scala',)),
        (KotlinLexer, ('.kt', '.ktm', '.kts')),
        (CommonLispLexer, ('.lisp',)),
        # Makefiles/CMake files are matched by name fragment, not extension.
        (MakefileLexer, ('make', 'Make')),
        (CMakeLexer, ('CMake', 'cmake')),
        (MatlabLexer, ('.m', '.mat')),
        (DelphiLexer, ('.dpr',)),
        (PerlLexer, ('.perl',)),
        (PhpLexer, ('.php',)),
        (PrologLexer, ('.pr',)),
        (Python3Lexer, ('.py',)),
        (RubyLexer, ('.rb',)),
        (BashLexer, ('.sh',)),
        (MySqlLexer, ('.sql', '.mysql')),
        (TclLexer, ('.tcl',)),
        (AwkLexer, ('.awk',)),
    )
    lex = {}
    for lexer_cls, keys in table:
        for key in keys:
            lex[key] = lexer_cls()
    return lex
def hl(s, config, style_name=None):
    """Syntax-highlight *s* as Haskell for a 256-color terminal.

    :param s: the text to highlight.
    :param config: object whose ``style`` attribute names the default style.
    :param style_name: optional style name overriding ``config.style``.
    :return: the highlighted string, or *s* unchanged if it already
        contains ANSI escape sequences.
    """
    style = get_style_by_name(style_name if style_name else config.style)
    # Don't try to highlight if the string already has escape sequences
    if '\033[' in s:
        return s
    # Sometimes highlight adds an extra newline, so we remove it
    return highlight(s, HaskellLexer(), Terminal256Formatter(style=style)).strip()
class PygmentsParser:
    """Extract and count identifier tokens from source files with Pygments."""

    # Pygments lexer instance for each supported language.
    LEXERS = {
        "Scala": ScalaLexer(),
        "Swift": SwiftLexer(),
        "Kotlin": KotlinLexer(),
        "Haskell": HaskellLexer(),
    }
    # Pygments token types treated as identifiers for each language.
    TYPES = {
        "Scala": {pygments.token.Name, pygments.token.Keyword.Type},
        "Swift": {pygments.token.Name},
        "Kotlin": {pygments.token.Name},
        "Haskell": {pygments.token.Name, pygments.token.Keyword.Type},
    }

    @staticmethod
    def read_file(file: str) -> str:
        """
        Read the contents of the file.

        :param file: the path to the file.
        :return: the contents of the file.
        """
        # Read as UTF-8 explicitly: the bare open() used the platform locale
        # encoding, which can mis-decode source files on non-UTF-8 systems.
        with open(file, encoding="utf-8") as fin:
            return fin.read()

    @staticmethod
    def get_tokens(file: str, lang: str) -> Counter:
        """
        Gather a Counter object of tokens in the file and their count.

        :param file: the path to the file.
        :param lang: the language of file.
        :return: a Counter object of items: token and count.
        """
        content = PygmentsParser.read_file(file)
        tokens = []
        for token_type, value in pygments.lex(content, PygmentsParser.LEXERS[lang]):
            # Pygments token types support `in` as subtype containment, so this
            # keeps any token whose type falls under one of the listed types.
            if any(token_type in allowed for allowed in PygmentsParser.TYPES[lang]):
                # extend() takes any iterable — no need to wrap in list().
                tokens.extend(Subtokenizer.process_token(value))
        return Counter(tokens)
class PygmentsParser:
    """Parse identifiers out of source code using Pygments lexers."""

    # Pygments lexers corresponding to a given language.
    LEXERS = {
        "Scala": ScalaLexer(),
        "Swift": SwiftLexer(),
        "Kotlin": KotlinLexer(),
        "Haskell": HaskellLexer(),
    }
    # Pygments token types corresponding to identifiers in a given language.
    IDENTIFIERS = {
        "Scala": {pygments.token.Name, pygments.token.Keyword.Type},
        "Swift": {pygments.token.Name},
        "Kotlin": {pygments.token.Name},
        "Haskell": {pygments.token.Name, pygments.token.Keyword.Type},
    }

    @staticmethod
    def get_identifiers_sequence_from_code(code: str, lang: str,
                                           identifiers_verbose: bool = False,
                                           subtokenize: bool = False) -> \
            Union[List[str], List[IdentifierData]]:
        """
        Collect the identifiers of a code fragment in the given language.

        :param code: the code to parse.
        :param lang: the language of the code fragment.
        :param identifiers_verbose: if True, will save not only identifiers
            themselves, but also their parameters as IdentifierData.
        :param subtokenize: if True, will split the tokens into subtokens.
        :return: list of identifiers as either strings or IdentifierData objects.
        """
        tokens = []
        for token_type, value in pygments.lex(code, PygmentsParser.LEXERS[lang]):
            if not any(token_type in group
                       for group in PygmentsParser.IDENTIFIERS[lang]):
                continue
            # TODO: implement indexes for tokens, it's possible in pygments.
            # (0, 0, 0) for now.
            token = IdentifierData(value, 0, 0, 0) if identifiers_verbose else value
            if subtokenize:
                tokens.extend(subtokenize_identifier(token))
            else:
                tokens.append(token)
        return tokens

    @staticmethod
    def get_data_from_file(file: str, lang: str,
                           identifiers_verbose: bool = False,
                           subtokenize: bool = False) -> FileData:
        """
        Build a FileData object describing a file in the given language.

        :param file: path to file.
        :param lang: the language of code.
        :param identifiers_verbose: if True, will save not only identifiers
            themselves, but also their parameters as IdentifierData.
        :param subtokenize: if True, will split the tokens into subtokens.
        :return: FileData object.
        """
        code = read_file(file)
        identifiers = PygmentsParser.get_identifiers_sequence_from_code(
            code, lang, identifiers_verbose, subtokenize)
        identifiers_type = (IdentifiersTypes.VERBOSE if identifiers_verbose
                            else IdentifiersTypes.STRING)
        # "objects" stays empty: Pygments cannot recognize objects.
        return FileData(path=file, lang=lang, objects=[],
                        identifiers=identifiers,
                        identifiers_type=identifiers_type)
# Token table for the Dhall lexer: ordered list of (regex, token-type) rules.
# Rule order matters — earlier rules win, so keyword/word rules must come
# before the catch-all '.' rule at the end.
tokens = {
    'root': [
        # Line comment at end of a line of text.
        (r'^(.*)(-- .*)$', bygroups(token.Text, token.Comment)),
        # Block comment, possibly spanning multiple lines (non-greedy).
        (r'{-(?:.|\n)*?-}', token.Comment),
        (r"'[^']+'", token.String.Char),
        (r'"[^"]+"', token.String.Char),
        # '/' is a word breaker; this rule keeps Prelude URLs from being
        # highlighted piecewise.
        (r'(http[:a-zA-Z/\.-]+)', token.Text),
        # Dhall multi-line text literal delimited by '' ... ''.
        (r'(\'\'(?:.|\n)*?\'\')', token.String.Char),
        # Integer literal with optional explicit sign.
        (r'\b(\+\d+|-\d+|\d+)', token.Number.Integer),
        (r'\b(None|Some|Bool|Natural|Integer|Double|Text|Type|List|Optional)\b', token.Keyword.Type),
        (r'\b(%s)\b' % DhallKeywords, token.Keyword),
        (r'(%s)' % DhallWords, token.Operator.Word),
        (r'\b(True|False)\b', token.Name.Builtin.Pseudo),
        # Whole-line comment (no preceding text).
        (r'-- .*$', token.Comment),
        (r',', token.Punctuation),
        # Catch-all: anything not matched above is plain text.
        (r'.', token.Text),
    ]
}
lexers['dhall'] = DhallLexer(startinline=True)

# Aliases for compatibility with the GitHub syntax highlighter names.
from pygments.lexers.shell import BashLexer
from pygments.lexers.haskell import HaskellLexer
lexers['bash'] = BashLexer(startinline=True)
lexers['haskell'] = HaskellLexer(startinline=True)
def create_lexer(self):
    """Return a Haskell lexer that merges runs of same-type tokens."""
    from pygments.filters import TokenMergeFilter
    from pygments.lexers.haskell import HaskellLexer
    lexer = HaskellLexer()
    # Collapse consecutive tokens of identical type into one token.
    lexer.add_filter(TokenMergeFilter())
    return lexer