def findMain(code):
    """
    Look for the existence of if __name__ == '__main__'
    Documentation: https://docs.python.org/2/tutorial/modules.html in 6.1.1
    """
    found = False
    pos = 0
    lexer = PythonLexer()
    tokens_1 = pygments.lex(code, lexer)
    tokens_2 = pygments.lex(code, lexer)
    sequence_1 = [(Token.Keyword, '^if$'),
                  (Token.Name, '^__name__$'),
                  (Token.Operator, '^==$'),
                  (Token.Literal.String.Double, '^__main__$'),
                  (Token.Punctuation, '^:$')]
    sequence_2 = [(Token.Keyword, '^if$'),
                  (Token.Name, '^__name__$'),
                  (Token.Operator, '^==$'),
                  (Token.Literal.String.Single, '^__main__$'),
                  (Token.Punctuation, '^:$')]
    mainIdiom = PythonIdiom('ifNameMain')

    lineNum = _findSeqInTokens(sequence_1, tokens_1)
    if lineNum < 0:
        lineNum = _findSeqInTokens(sequence_2, tokens_2)
    if lineNum > 0:
        mainIdiom.addNew(lineNum)
    log("If name main found in lines: " + str(mainIdiom.getLines()))
    return mainIdiom
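A minimal standalone sketch (not part of the snippet above) of why two sequences are needed: pygments emits different string token types for single- and double-quoted '__main__', which you can see by printing the raw token stream.

# Illustrative only; assumes just pygments is installed.
import pygments
from pygments.lexers import PythonLexer

sample = "if __name__ == '__main__':\n    pass\n"
for ttype, value in pygments.lex(sample, PythonLexer()):
    # Prints each (token type, text) pair; the quote characters and the
    # '__main__' content arrive as separate string tokens.
    print(ttype, repr(value))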
def test_bare_class_handler():
    from pygments.formatters import HtmlFormatter
    from pygments.lexers import PythonLexer
    try:
        lex('test\n', PythonLexer)
    except TypeError as e:
        assert 'lex() argument must be a lexer instance' in str(e)
    else:
        assert False, 'nothing raised'
    try:
        format([], HtmlFormatter)
    except TypeError as e:
        assert 'format() argument must be a formatter instance' in str(e)
    else:
        assert False, 'nothing raised'
def BuildTags(self, buff, lexer):
    """
    @param buff: code buffer
    @param lexer: xml lexer
    @return: taglib.DocStruct instance for the given buff
    """
    rtags = taglib.DocStruct()
    rtags.SetElementDescription(self.TAG_ID, '/')

    line_count = 0
    current_line = []
    code_lines = []
    # Parse the file into tokens and values
    for ttype, value in lex(buff.read(), lexer):
        if '\n' in value:
            if len(current_line) > 0:
                code_lines.append((line_count, current_line))
                current_line = []
            line_count += value.count('\n')
            continue
        if ttype == Token.Name.Tag and len(value) > 1:
            current_line.append((ttype, value))

    docroot = self.Parse(code_lines)
    if docroot != None:
        rtags.AddElement(self.TAG_ID, docroot)
    return rtags
def highlightMultiSource(codeLexerTuples, multiSourceFormatter, outfile=None):
    """
    main function to create formatted output based on tuples of code and
    related metadata (lexing information and title to display)
    """
    if not isinstance(codeLexerTuples, tuple):
        raise TypeError("first highlight() argument must be a tuple of "
                        "codeLexerTuple")
    if not isinstance(multiSourceFormatter, Formatter):
        raise TypeError("second highlight() argument must be a "
                        "MultiSourceFormatter")

    tokensList = []
    for codeLexerTuple in codeLexerTuples:
        tokensList.append(lex(codeLexerTuple.code, codeLexerTuple.lexer))
        multiSourceFormatter.titles.append(codeLexerTuple.title)

    if not outfile:
        # print formatter, 'using', formatter.encoding
        realoutfile = multiSourceFormatter.encoding and BytesIO() or StringIO()
        multiSourceFormatter.format(tokensList, realoutfile)
        return realoutfile.getvalue()
    else:
        multiSourceFormatter.format(tokensList, outfile)
def style_ansi(raw_code, lang=None):
    """ actual code hilite """
    lexer = 0
    if lang:
        try:
            lexer = get_lexer_by_name(lang)
        except ValueError:
            print col(R, 'Lexer for %s not found' % lang)
            lexer = None
    if not lexer:
        try:
            if guess_lexer:
                lexer = pyg_guess_lexer(raw_code)
        except:
            pass
    if not lexer:
        lexer = get_lexer_by_name(def_lexer)

    tokens = lex(raw_code, lexer)
    cod = []
    for t, v in tokens:
        if not v:
            continue
        _col = code_hl_tokens.get(t)
        if _col:
            cod.append(col(v, _col))
        else:
            cod.append(v)
    return ''.join(cod)
def findBadUseImport(code):
    """ Find when use from foo import *
    Documentation:
        http://python.net/~goodger/projects/pycon/2007/idiomatic/handout.html#importing
        https://docs.python.org/2/howto/doanddont.html#from-module-import
    """
    sequence = [(Token.Keyword.Namespace, '^from$'),
                (Token.Name.Namespace, '.*'),
                (Token.Keyword.Namespace, '^import$'),
                (Token.Operator, '\*')]
    lexer = PythonLexer()
    lexer.add_filter('tokenmerge')
    tokens = pygments.lex(code, lexer)
    badUseImport = PythonIdiom('badImport')

    lineNumber = 1
    while True:
        lineAux = _findSeqInTokens(sequence, tokens)
        if lineAux < 0:
            break
        lineNumber += lineAux - 1
        badUseImport.addNew(lineNumber)
    log("badUseImport found in lines {0}".format(badUseImport.getLines()))
    return badUseImport
def __init__(self, disassembly, lexer=lexer, msg=None):
    self.lines = []
    if isinstance(disassembly, list):
        self.lines = disassembly
    elif disassembly:
        line = []
        if msg:
            current_function = msg.rsplit(None, 1)[-1][:-1]
        else:
            current_function = None
        with currentfunctiontfilter.current_function(current_function):
            for ttype, value in pygments.lex(disassembly, lexer):
                if '\n' in value:
                    self.lines.append(DisassemblyLine(line))
                    line = []
                else:
                    line.append((ttype, value))

    self.linenos = {}
    for i, line in enumerate(self.lines):
        self.linenos[line.address] = line, i

    self.lexer = lexer
    self.msg = msg
def lex(self, code, lex):
    """Return tokenified code.

    Return a list of tuples (scope, word) where word is the word to be
    printed and scope the scope name representing the context.

    :param str code: Code to tokenify.
    :param lex: Lexer to use.
    :return:
    """
    if lex is None:
        if not type(code) is str:
            # if no suitable lexer is found, return decoded code
            code = code.decode("utf-8")
        return (("global", code),)

    words = pygments.lex(code, lex)

    scopes = []
    for word in words:
        token = word[0]
        scope = "global"
        if token in self.token_map.keys():
            scope = self.token_map[token]
        scopes.append((scope, word[1]))
    return scopes
def main():
    arguments = docopt(
        __doc__.format(program=docstring_format_dict),
        version='{docstring_format_dict["human_format"]} 2.0',
        options_first=True)

    lexer = BibtexLexer()
    lexer.add_filter(RaiseOnErrorTokenFilter())
    #lexer.add_filter( TokenMergeFilter() )
    lexer.add_filter(KeywordCaseFilter(case='lower'))

    for f in arguments['<file>']:
        # get bibtex source
        code = None
        with open(f, 'r') as f:
            code = ''.join(f.readlines())

        # NOW LEX SEE CODE!
        for idx, item in enumerate(pygments.lex(code, lexer)):
            tokentype, tokenvalue = item[0], item[1]
            # if tokentype in frozenset([Token.Text.Whitespace, Token.Punctuation]):
            #     continue
            print("{0:>5}\t{1[0]!s:<25}\t{1[1]!r}".format(idx, item),
                  file=sys.stdout)
def _lexContents(self):
    # We add a space in front because otherwise the lexer will discard
    # everything up to the first token, meaning that we lose the potentially
    # empty first lines and mess up the matching. With the space, we force
    # the lexer to process the initial \n. and we just skip the space token
    tokens = list(pygments.lex(" " + self._document.documentText(),
                               pygments.lexers.PythonLexer()))
    self._document.beginTransaction()
    current_line_num = 1
    meta = []

    # Skip the space token
    for token in tokens[1:]:
        ttype, string = token
        meta.extend([ttype] * len(string))
        if string.endswith('\n'):
            self._document.deleteCharMeta((current_line_num, 1),
                                          self._document.lineLength(current_line_num),
                                          CharMeta.LexerToken)
            self._document.updateCharMeta((current_line_num, 1),
                                          {CharMeta.LexerToken: meta})
            current_line_num += 1
            meta = []

    self._document.endTransaction()
def filename(self, value):
    "Set the file being displayed by the view"
    if self._filename != value:
        self.code.delete('1.0', END)
        with open(value) as code:
            all_content = code.read()
            if self.lexer:
                lexer = self.lexer
            else:
                lexer = guess_lexer_for_filename(value, all_content, stripnl=False)
            for token, content in lex(all_content, lexer):
                self.code.insert(END, content, str(token))

        # Now update the text for the linenumbers
        end_index = self.code.index(END)
        line_count = int(end_index.split('.')[0])
        lineNumbers = '\n'.join('%5d' % i for i in range(1, line_count))
        self.lines.config(state=NORMAL)
        self.lines.delete('1.0', END)
        self.lines.insert('1.0', lineNumbers)
        self.lines.config(state=DISABLED)

        # Store the new filename, and clear any current line
        self._filename = value
        self._line = None
def SyntexHighlight(self, event=None):
    from tkinter.font import Font
    for tag in self.tag_names():
        self.tag_delete(tag)
    self.mark_set("range_start", "1.0")
    data = self._get_value()
    self.tag_configure("Token.Comment", foreground="#F00")
    bolder = Font(family=self.app.cnf['font'][0])
    bolder.config(size=self.app.cnf['font'][1] - 2)
    bolder.config(weight="bold")
    for token, content in lex(data, PythonLexer()):
        self.mark_set("range_end", "range_start + %dc" % len(content))
        self.tag_add(str(token), "range_start", "range_end")
        self.mark_set("range_start", "range_end")
    self.tag_config("Token.Comment.Single", foreground="#F00")
    self.tag_config("Token.Literal.String.Doc", foreground="#F00")
    for tag in self.tag_names():
        if 'Token.Keyword' == tag:
            self.tag_config(tag, foreground="#008", font=bolder)
        elif 'Token.Keyword.Namespace' == tag:
            self.tag_config(tag, foreground="#00F", font=bolder)
        elif 'Token.Name.Class' in tag:
            self.tag_config(tag, foreground="#F30", background='#AFA')
        elif 'Token.Name.Function' in tag:
            self.tag_config(tag, foreground="#A3A", background='#FFA')
        elif 'Token.Literal' in tag:
            self.tag_config(tag, foreground="#6A0")
        elif 'Token.Operator' in tag:
            self.tag_config(tag, foreground="#A3A")
    print(self.tag_names())
def _generator():
    lexer = self.create_lexer()
    raw = pygments.lex(code_region.text, lexer)
    row, col = 0, 0
    begin_column = False
    for type, value in raw:
        tok = Tok(type, value, row, col, begin_column)
        if '\n' in value:
            # Pygments doesn't necessarily split spaces and newlines into
            # separate tokens, so we do it ourselves, ensuring there is always
            # a token at column 0 of every row
            spaces = value.split('\n')
            begin_column = False
            if spaces[0]:
                yield Tok(type, spaces[0], row, col)
                col += len(spaces[0])
            for sp in spaces[1:]:
                yield Tok(Token.Text, '\n', row, col)
                row += 1
                col = 0
                if not sp:
                    continue
                yield Tok(Token.Text, sp, row, col)
                begin_column = True
                col += len(sp)
            continue
        if tok.is_whitespace():
            if len(value) > 1:
                begin_column = True
        else:
            begin_column = False
        col += len(value)
        yield tok
def check(*expected):
    text = ''.join(i[1] for i in expected)
    md_lexer = MarkdownLexer()
    md_lexer.add_filter('raiseonerror')
    md_lexer.add_filter('tokenmerge')
    result = list(pygments.lex(text, md_lexer))
    assert result == list(expected)
def __init__(self, pdf, code, lexer):
    self.pdf = pdf
    fname, fstyle, fsize = self.pdf.theme["code-font"]
    self.pdf.set_font(fname, fstyle, fsize)
    style = pygments.styles.get_style_by_name("emacs")
    style = dict(style)
    for token, text in pygments.lex(code["code"], lexer):
        token_style = style[token]
        if token_style["color"]:
            r, g, b = map(ord, token_style["color"].decode("hex"))
        else:
            r, g, b = (0, 0, 0)
        self.pdf.set_text_color(r, g, b)

        if token_style["bold"] and token_style["italic"]:
            self.pdf.set_font(fname, "BI", fsize)
        elif token_style["bold"]:
            self.pdf.set_font(fname, "B", fsize)
        elif token_style["italic"]:
            self.pdf.set_font(fname, "I", fsize)
        else:
            self.pdf.set_font(fname, "", fsize)

        height = pdf.theme["code-height"]
        self.pdf.write(height, text)
def findMagicMethods(code):
    """ Search for magic methods in the code and return a list of how many
    were found, their difficulty level, and which ones they were
    Documentation: http://www.rafekettler.com/magicmethods.html
                   Python Pocket Reference page 88
    """
    lexer = PythonLexer()
    tokens = pygments.lex(code, lexer)
    lineNumber = 1
    methodsFound = []
    methodsIdiom1 = PythonIdiom('idiomMethods1')
    methodsIdiom2 = PythonIdiom('idiomMethods2')
    methodsIdiom3 = PythonIdiom('idiomMethods3')

    for ttype, word in tokens:
        lineNumber += _getNewLines((ttype, word))
        if ttype is Token.Name.Function:
            if word in magicMethods_1:
                methodsIdiom1.addNew(lineNumber, otherInfo={'method': word})
                methodsFound.append(word)
            elif word in magicMethods_2:
                methodsIdiom2.addNew(lineNumber, otherInfo={'method': word})
                methodsFound.append(word)
            elif word in magicMethods_3:
                methodsIdiom3.addNew(lineNumber, otherInfo={'method': word})
                methodsFound.append(word)

    log("MagicMethods: %s" % str(methodsFound))
    return [methodsIdiom1, methodsIdiom2, methodsIdiom3]
def basicStructure(code):
    sequence = []
    lexer = PythonLexer()
    lexer.add_filter('tokenmerge')
    tokens = pygments.lex(code, lexer)
    for token in tokens:
        print token
def checkBadLoopCollect(code):
    """ Look for bad loop like 'for i in range(len(list))'
    Documentation: https://youtu.be/OSGv2VnC0go?t=4m47s
    """
    sequence = [(Token.Keyword, '^for$'),
                (Token.Name, '^\w+$'),
                (Token.Operator.Word, '^in$'),
                (Token.Name.Builtin, '^range$|^xrange$'),
                (Token.Punctuation, '^\($'),
                (Token.Name.Builtin, '^len$'),
                (Token.Punctuation, '^\($'),
                (Token.Name, '^\w+$')]

    lexer = PythonLexer()
    lexer.add_filter('tokenmerge')
    tokens = pygments.lex(code, lexer)
    badLoopCollectIdiom = PythonIdiom('badLoop')

    lineNumber = 1
    while True:
        lineAux = _findSeqInTokens(sequence, tokens)
        if lineAux < 0:
            break
        lineNumber += lineAux - 1
        badLoopCollectIdiom.addNew(lineNumber)
    log("badLoopCollect found in lines {0}".format(badLoopCollectIdiom.getLines()))
    return badLoopCollectIdiom
def getCodeStyleChunks(self, node):
    assert node.tag == 'code'
    lang = node.attrib.get('lang', 'python')  # @TODO: error handling if lang is bad
    lexer = pygments.lexers.get_lexer_by_name(lang)
    for tok, text in pygments.lex(e2txt(node), lexer):
        yield [[tok]], text
def checkNotRange(code):
    """ Check if there is: for xx in [0,1,2] instead of for xxx in (x)range
    Documentation: https://youtu.be/OSGv2VnC0go?t=3m4s
    """
    sequence = [(Token.Keyword, '^for$'),
                (Token.Name, '^\w+$'),
                (Token.Operator.Word, '^in$'),
                (Token.Punctuation, '^\[$'),
                (Token.Literal.Number.Integer, '^\d$')]

    lexer = PythonLexer()
    lexer.add_filter('tokenmerge')
    tokens = pygments.lex(code, lexer)
    notRangeIdiom = PythonIdiom('notRange')

    lineNumber = 1
    while True:
        lineAux = _findSeqInTokens(sequence, tokens)
        if lineAux < 0:
            break
        lineNumber += lineAux - 1
        notRangeIdiom.addNew(lineNumber)
    log("badForIn found in lines {0}".format(notRangeIdiom.getLines()))
    return notRangeIdiom
def findDocstring(code):
    """Find the use of documentation in the functions, classes or script
    Documentation: https://www.python.org/dev/peps/pep-0257/
    """
    lexer = PythonLexer()
    lexer.add_filter('tokenmerge')
    classDefToken = (Token.Keyword, '^class$')
    functDefToken = (Token.Keyword, '^def$')

    tokens = pygments.lex(code, lexer)
    docIdiom = PythonIdiom('docstring')
    docstringFound = defaultdict(int)
    typeDoc = 'module'
    lineNumber = 1

    for ttype, word in tokens:
        if _sameToken((ttype, word), classDefToken):
            typeDoc = 'class'
        elif _sameToken((ttype, word), functDefToken):
            typeDoc = 'function'
        elif ttype == Token.Literal.String.Doc:
            docstringFound[typeDoc] += 1
            docIdiom.addNew(lineNumber)
        lineNumber += _getNewLines((ttype, word))

    for typeDoc in docstringFound:
        log("type %s: %d found" % (typeDoc, docstringFound[typeDoc]))
    log('DocString found in lines: ' + str(docIdiom.getLines()))
    return docIdiom
def lex(code_lines, lexername):
    try:
        from pygments.lexers import get_lexer_by_name
        from pygments import lex
    except ImportError:
        print('For lexer support please install extras: pip install sourcemap-tool[lexer]', file=stderr)
        exit(1)

    # TODO: join lexemes with trailing space, remove comment lexemes
    lexer = get_lexer_by_name(lexername)
    tokens = lex(''.join(code_lines), lexer)

    result = []
    line = []
    for _, text in tokens:
        parts = text.split('\n')
        if len(parts) > 1:
            # multiline token
            first = True
            for part in parts:
                if not first:
                    result.append(line)
                    line = []
                first = False
                if len(part) > 0:
                    line.append(len(part))
        else:
            if len(text) > 0:
                line.append(len(text))
    if line:
        result.append(line)
    return result
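An illustrative call of the helper above (assuming it is in scope); each inner list holds the lengths of the tokens on one source line, though the exact splits depend on the pygments version.

# Hypothetical usage of the lex(code_lines, lexername) helper defined above.
lengths = lex(["def f(x):\n", "    return x\n"], 'python')
print(lengths)  # e.g. [[3, 1, 1, 1, 1, 1, 1], [4, 6, 1, 1]]; exact token splitting varies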
def scan_source(fp, args):
    # print("scanning: %r" % fp)

    global filepath

    filepath = fp
    filepath_base = os.path.basename(filepath)

    # print(highlight(code, CLexer(), RawTokenFormatter()).decode('utf-8'))
    code = open(filepath, 'r', encoding="utf-8").read()

    tokens[:] = []
    line = 1

    for ttype, text in lex(code, CLexer()):
        tokens.append(TokStore(ttype, text, line))
        line += text.count("\n")

    col = 0  # track line length
    index_line_start = 0

    for i, tok in enumerate(tokens):
        # print(tok.type, tok.text)
        if tok.type == Token.Keyword:
            if tok.text in {"switch", "while", "if", "for"}:
                item_range = extract_statement_if(i)
                if item_range is not None:
                    blender_check_kw_if(item_range[0], i, item_range[1])
            elif tok.text == "else":
                blender_check_kw_else(i)
        elif tok.type == Token.Punctuation:
            if tok.text == ",":
                blender_check_comma(i)
        elif tok.type == Token.Operator:
            # we check these in pairs, only want first
            if tokens[i - 1].type != Token.Operator:
                op, index_kw_end = extract_operator(i)
                blender_check_operator(i, index_kw_end, op)
        elif tok.type in Token.Comment:
            doxyfn = None
            if "\\file" in tok.text:
                doxyfn = tok.text.split("\\file", 1)[1].strip().split()[0]
            elif "@file" in tok.text:
                doxyfn = tok.text.split("@file", 1)[1].strip().split()[0]

            if doxyfn is not None:
                doxyfn_base = os.path.basename(doxyfn)
                if doxyfn_base != filepath_base:
                    warning("doxygen filename mismatch %s != %s" %
                            (doxyfn_base, filepath_base), i, i)

        # ensure line length
        if (not args.no_length_check) and tok.type == Token.Text and tok.text == "\n":
            # check line len
            blender_check_linelength(index_line_start, i - 1, col)
            col = 0
            index_line_start = i + 1
        else:
            col += len(tok.text.expandtabs(TAB_SIZE))
def parse(s, l):
    ret_list = []
    start = 0
    for token in lex(s, l):
        color = determine_color(token[0])
        ret_list.append((start, token[1], color))
        start += len(token[1])
    print ret_list
def lex(self):
    # Get lexer for language (use text as fallback)
    try:
        lexer = get_lexer_by_name(self.language)
    except ValueError:
        # info: "no pygments lexer for %s, using 'text'" % self.language
        lexer = get_lexer_by_name('text')
    return pygments.lex(self.code, lexer)
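A hedged standalone sketch of the fallback pattern above, outside the original class (the helper name is illustrative): an unknown language name falls back to the null 'text' lexer.

# Minimal demonstration, assuming only pygments is installed.
import pygments
from pygments.lexers import get_lexer_by_name

def lex_with_fallback(code, language):
    try:
        lexer = get_lexer_by_name(language)
    except ValueError:  # pygments raises ClassNotFound, a ValueError subclass
        lexer = get_lexer_by_name('text')
    return pygments.lex(code, lexer)

print(list(lex_with_fallback("print('hi')", 'no-such-language'))[:3])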
def findUpdateVariables1Line(code):
    """ Look for lines of code like this: 'x, y = 0, 1' or 'x, y = y, x+y'
    """
    lexer = PythonLexer()
    tokens = pygments.lex(code, lexer)
    linesFound = []
    assignIdiom = PythonIdiom('assignOneLine')

    # Tokens variables
    nameToken = (Token.Name, '^\w+$')
    equalToken = (Token.Operator, '^\=$')
    newLineToken = (Token.Text, '\n')
    commaToken = (Token.Punctuation, '^,$')

    # To avoid mistakes, I count the variables before/after the equal
    numVarPrevEqual = 0
    numVarPostEqual = 0
    numCommas = 0
    beforeEqual = True
    actualLine = ''
    ignoreLine = False

    lineNumber = 1
    for ttype, word in tokens:
        if not _ignoreStr(word):
            actualLine += word.encode('utf-8')
        lineNumber += _getNewLines((ttype, word))

        if _sameToken((ttype, word), newLineToken):
            beforeEqual = True
            if numVarPrevEqual == numVarPostEqual and numVarPrevEqual > 1:
                if not ignoreLine:
                    linesFound.append(actualLine)
                    assignIdiom.addNew(lineNumber - 1)  # -1 because waits until the line finish
            actualLine = ''
            numVarPrevEqual, numVarPostEqual, numCommas, ignoreLine = 0, 0, 0, False
            continue
        if ignoreLine:
            continue

        if _sameToken((ttype, word), equalToken):
            if not beforeEqual:
                ignoreLine = True
            beforeEqual = False
            numCommas = 0
        elif _sameToken((ttype, word), commaToken):
            numCommas += 1

        if beforeEqual:
            if _sameToken((ttype, word), nameToken) and (numCommas == numVarPrevEqual):
                numVarPrevEqual += 1
        else:
            if re.match('\w+', word.encode('utf-8')) and (numCommas == numVarPostEqual):
                numVarPostEqual += 1

    log("Update in 1 line. Found: " + str(linesFound))
    log("Update in 1 line found in lines " + str(assignIdiom.getLines()))
    return assignIdiom
def show_it(string, tf, width=80):
    tf.reset(width)
    print('=' * 30)
    for t in lex(string, rst_lex):
        print(t)
        pass
    print('-' * 30)
    print(highlight(string, rst_lex, tf))
    return
def test_3(self):
    # Note that this will add a newline to the lexed output, since the
    # `ensurenl <http://pygments.org/docs/lexers/>`_ option is True by
    # default.
    lexer = get_lexer_by_name('python')
    token_iter = lex('', lexer)
    # Capture both group and string for help in debugging.
    token_group = list(_group_lexer_tokens(token_iter, True, False))
    assert token_group == [(_GROUP.whitespace, '\n')]
def test_1(self):
    test_py_code = '# A comment\nan_identifier\n'
    test_token_list = [(Token.Comment.Single, '# A comment'),
                       (Token.Text, '\n'),
                       (Token.Name, 'an_identifier'),
                       (Token.Text, '\n')]
    lexer = get_lexer_by_name('python')
    token_list = list(lex(test_py_code, lexer))
    assert token_list == test_token_list
def _lex(self, lexer):
    """ Lex the document. """
    current_location = 0
    for token, text in pygments.lex(str(self.raw_text_document), lexer):
        stop_position = current_location + len(text)
        flat_slice = FlatSlice(current_location, stop_position)
        self.append(HighlightedTextFragment(flat_slice, token))
        current_location = stop_position
def highlight(self, block):
    """Method called on each block to highlight its content"""
    tokens = pygments.lex(block, self.python_lexer)
    if self.format_rst:
        from pygments.token import Token
        toks = []
        for token in tokens:
            if token[0] == Token.String.Doc and len(token[1]) > 6:
                toks += pygments.lex(token[1][:3], self.python_lexer)
                # parse doc string content by rst lexer
                toks += pygments.lex(token[1][3:-3], self.rst_lexer)
                toks += pygments.lex(token[1][-3:], self.python_lexer)
            elif token[0] == Token.Comment.Single:
                toks.append((Token.Comment.Single, token[1][0]))
                # parse comment content by rst lexer
                # remove the extra newline added by rst lexer
                toks += list(pygments.lex(token[1][1:], self.rst_lexer))[:-1]
            else:
                toks.append(token)
        tokens = toks
    return pygments.format(tokens, self.formatter)
def capture_comment(content, lexer, start):
    # look backward to capture the entire comment in case we are in the middle
    # of a multiline comment
    comment_start = comment_end = start
    for line in reversed(content[:start]):
        ttypes = [t for t, _ in pygments.lex(line, lexer)]
        # if a line has no keyword, name or operator
        # and has a comment token we assume it is a part of the initial comment
        if is_a_comment_line_java(ttypes):
            comment_start -= 1
        else:
            break

    # look forward to capture the entire comment in case we are in the middle
    # of a multiline comment
    for line in content[start:]:
        ttypes = [t for t, _ in pygments.lex(line, lexer)]
        if is_a_comment_line_java(ttypes):
            comment_end += 1
        else:
            break

    comment = content[comment_start:comment_end]
    return comment, comment_end
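The is_a_comment_line_java helper is not shown in this snippet; a plausible sketch of such a check, using only pygments token types, might look like the following (the original project's exact rules may differ).

# Hypothetical helper: a line counts as "comment only" when it contains a
# comment token and no keyword, name, or operator tokens.
from pygments.token import Token

def is_a_comment_line_java(ttypes):
    has_comment = any(t in Token.Comment for t in ttypes)
    has_code = any(t in Token.Keyword or t in Token.Name or t in Token.Operator
                   for t in ttypes)
    return has_comment and not has_code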
def extract_code(start_lineno, file_name):
    with open(file_name.as_posix(), mode='r', encoding='iso-8859-1') as f:
        content = f.readlines()
    lexer = build_lexer('java')
    # content array is 0 index so need to shift down by 1
    start_lineno = max(0, start_lineno - 1)
    comment, comment_end = capture_comment(content, lexer, start_lineno)
    to_extract_content = content[comment_end:]

    code_end = 1
    heuristic = None
    block_count = 0
    for i, line in enumerate(to_extract_content):
        tokens = list(pygments.lex(line, lexer))
        should_stop, reason = Heuristic.should_stop_java(tokens)
        if should_stop and block_count == 0:
            heuristic = reason
            code_end = i
            break

    if heuristic == Heuristic.CLOSE_PAREN:
        code_end = min(code_end, len(to_extract_content))
        code = capture_code(code_end, lexer, to_extract_content)
        comment = strip_special_chars(comment)
    elif heuristic == Heuristic.NEXT_COMMENT:
        code_end = min(code_end, len(to_extract_content))
        code = capture_code(code_end, lexer, to_extract_content)
        comment = strip_special_chars(comment)
    else:
        code_end = min(code_end + 5, len(to_extract_content))
        code = capture_code(code_end + 1, lexer, to_extract_content)
        comment = strip_special_chars(comment)

    # if "Close the server and confirm it saw what we expected." in comment:
    #     set_trace()

    # skipping comment and code are on the same line case
    # if not comment:
    #     if len(content) - 1 < start_lineno:
    #         print("Length of content is less than start_line {}".format(
    #             file_name.as_posix()))
    #         return None, None, None
    #     ttypes = [t for t, _ in pygments.lex(content[start_lineno], lexer)]
    #     if is_a_code_line(ttypes) and contains_a_comment(ttypes):
    #         line = content[start_lineno].split("//")
    #         if len(line) != 2:
    #             return None, None, None
    #         code, comment = line[:-1], line[-1]
    #         code = [w.strip() for w in code]
    #         comment = comment.strip().replace("\n", "\\n")

    return clean_comment(comment), clean_code(code), heuristic
def initial_highlight(self, *args):
    content = self.text.get("1.0", tk.END)
    self.text.mark_set("range_start", "1.0")
    data = self.text.get("1.0", tk.END)
    for token, content in lex(data, self.lexer):
        self.text.mark_set("range_end", "range_start + %dc" % len(content))
        self.text.tag_add(str(token), "range_start", "range_end")
        self.text.mark_set("range_start", "range_end")
    self.previousContent = self.text.get("1.0", tk.END)
    self.syntax_theme_configuration()
def _render_highlighted_block(self, content, language):
    code = indent(content, " " * 2)
    lexer = get_lexer(language or "")
    if lexer:
        formatted_text = PygmentsTokens(pygments.lex(code=code, lexer=lexer))
    else:
        formatted_text = to_formatted_text(code, style="")
    return formatted_text
def run_seq(seq):
    tokens = list(pygments.lex(seq, TypeScriptLexer()))
    ws, tokens = prep(tokens)

    # Set up tensors
    inputs = np.zeros(len(ws))
    outputs = np.zeros(len(ws))
    for i in range(len(ws)):
        inputs[i] = source_dict[ws[i]] if ws[i] in source_dict else source_dict["_UNKNOWN_"]

    N = len(inputs)
    if N > 4 * minibatch_size:
        return None
    inputs = scipy.sparse.csr_matrix(
        (np.ones(N, np.float32), (range(N), inputs)), shape=(N, vocab_size))
    outputs = scipy.sparse.csr_matrix(
        (np.ones(N, np.float32), (range(N), outputs)), shape=(N, num_labels))
    sIn = C.io.MinibatchSourceFromData(
        dict(xx=([inputs], C.layers.typing.Sequence[C.layers.typing.tensor]),
             yy=([outputs], C.layers.typing.Sequence[C.layers.typing.tensor])))
    mb = sIn.next_minibatch(N)
    data = {x: mb[sIn.streams['xx']], y: mb[sIn.streams['yy']]}
    enhance_data(data, enc)
    pred = dec.eval({x: data[x], t: data[t]})[0]

    with open(outp, 'w', encoding="utf-8") as f:
        ix = 0
        sep = chr(31)
        for tt, v in tokens:
            f.write("%s%s%s" % (v.replace("\t", "\\t").replace(
                "\n", "\\n").replace("\r", "\\r"), sep, str(tt)[6:]))
            print(v, end='')
            if v.strip() == '' or tt in Comment:
                f.write('\n')
                continue
            pr = pred[ix]
            ix += 1
            if v.strip() in keywords or not bool(regex.match(v.strip())):
                f.write('\n')
                continue
            r = [i[0] for i in sorted(
                enumerate(pr), key=lambda x: x[1], reverse=True)]
            guess = target_wl[r[0]]
            gs = [target_wl[r[ix]] for ix in range(5)]
            gs = [g[1:len(g) - 1] if g[0] == "$" else g for g in gs]
            if target_wl[r[0]] != "O":
                print(" : %s" % guess[1:len(guess) - 1], end='')
            for i in range(len(gs)):
                f.write("%s%s%s%.4f" % (sep, gs[i], sep, pr[r[i]]))
            f.write('\n')
    print()
def __init__(self, file_path, var_table=None):
    self.program_text = open(file_path, "r").read()
    self.tokens = list(lex(self.program_text, HexRaysCLexer()))
    # Maps a placeholder id to a dict of variable names
    self.var_table = dict()

    if var_table:
        with open(var_table, newline="") as tablefile:
            reader = csv.DictReader(tablefile, delimiter=",", quotechar="|")
            for row in reader:
                self.var_table[row.pop("var_id")] = row
def _highlight(self, start_pos, text):
    """Highlight the text."""
    self.text.mark_set('range_start', start_pos)
    for token, content in lex(text, PythonLexer()):
        self.text.mark_set(
            'range_end', 'range_start+{0}c'.format(len(content))
        )
        self.text.tag_add(str(token), 'range_start', 'range_end')
        self.text.mark_set('range_start', 'range_end')

        # Save import names, function names and class names,
        # since they are used for the completion list
        if str(token) in ('Token.Name.Namespace', 'Token.Name.Class', 'Token.Name.Function'):
            self.var_name_list.add(content)
def test_bare_class_handler():
    from pygments.formatters import HtmlFormatter
    from pygments.lexers import PythonLexer
    try:
        lex('test\n', PythonLexer)
    except TypeError as e:
        assert 'lex() argument must be a lexer instance' in str(e)
    else:
        assert False, 'nothing raised'
    try:
        format([], HtmlFormatter)
    except TypeError as e:
        assert 'format() argument must be a formatter instance' in str(e)
    else:
        assert False, 'nothing raised'

    # These cases should not trigger this heuristic.
    class BuggyLexer(RegexLexer):
        def get_tokens(self, text, extra_argument):
            pass
        tokens = {'root': []}

    try:
        list(lex('dummy', BuggyLexer()))
    except TypeError as e:
        assert 'lex() argument must be a lexer instance' not in str(e)
    else:
        assert False, 'no error raised by buggy lexer?'

    class BuggyFormatter(Formatter):
        def format(self, tokensource, outfile, extra_argument):
            pass

    try:
        format([], BuggyFormatter())
    except TypeError as e:
        assert 'format() argument must be a formatter instance' not in str(e)
    else:
        assert False, 'no error raised by buggy formatter?'
def get_tokenization(lexedWoComments, lexer):
    tokenized_string = ''
    token_types = []
    curr_line_empty = True
    for t in lexedWoComments:
        token_type = str(t[0])
        token = t[1]
        token_stripped = token.strip()

        # Pygments will sometimes lex many tokens as one
        # This can occur with preprocessor directives and definitions in C
        # In this case, we need to lex that whole line
        num_tokens = len(token.split())
        if num_tokens > 1:
            # Need to manually lex each space separated token on occasions
            # when pygments doesn't lex properly
            line_split = token.split()
            line_lexed = []
            for temp_token in line_split:
                token_lexed = list(lex(temp_token, lexer))
                for lexed in token_lexed:
                    if lexed[1] != "\n":
                        line_lexed.append(lexed)
            line_lexed.append((Token.Text, '\n'))
            line_code, line_types = get_tokenization(line_lexed, lexer)
            tokenized_string += line_code
            token_types += line_types
            curr_line_empty = True
            continue

        if '\n' in token:
            if curr_line_empty:
                if (t[0] != Token.Text or t[0] != Token.Comment.Preproc) and token_stripped != '':
                    tokenized_string += token_stripped + "\n"
                    token_types.append(token_type)
            else:
                tokenized_string += token_stripped + "\n"
                # Edge case for stray "\" in code
                if token_stripped == "\\":
                    token_types.append(token_type)
            curr_line_empty = True
        elif t[0] != Token.Text and len(token_stripped) > 0:
            curr_line_empty = False
            tokenized_string += token + ' '
            token_types.append(token_type)

    assert len(tokenized_string.split()) == len(token_types), "{0} != {1}".format(
        len(tokenized_string.split()), len(token_types))

    return tokenized_string, token_types
def _parse_led_config(file, matrix_cols, matrix_rows):
    """Return any 'raw' led/rgb matrix config
    """
    matrix_raw = []
    position_raw = []
    flags = []

    found_led_config = False
    bracket_count = 0
    section = 0
    for _type, value in lex(_preprocess_c_file(file), CLexer()):
        # Assume g_led_config..stuff..;
        if value == 'g_led_config':
            found_led_config = True
        elif value == ';':
            found_led_config = False
        elif found_led_config:
            # Assume bracket count hints to section of config we are within
            if value == '{':
                bracket_count += 1
                if bracket_count == 2:
                    section += 1
            elif value == '}':
                bracket_count -= 1
            else:
                # Assume any non whitespace value here is important enough to stash
                if _type in [Token.Literal.Number.Integer, Token.Literal.Number.Float,
                             Token.Literal.Number.Hex, Token.Name]:
                    if section == 1 and bracket_count == 3:
                        matrix_raw.append(_coerce_led_token(_type, value))
                    if section == 2 and bracket_count == 3:
                        position_raw.append(_coerce_led_token(_type, value))
                    if section == 3 and bracket_count == 2:
                        flags.append(_coerce_led_token(_type, value))

    # Slightly better interim format
    matrix = list(_get_chunks(matrix_raw, matrix_cols))
    position = list(_get_chunks(position_raw, 2))
    matrix_indexes = list(filter(lambda x: x is not None, matrix_raw))

    # If we have not found anything - bail with no error
    if not section:
        return None

    # Throw any validation errors
    _validate_led_config(matrix, matrix_rows, matrix_indexes, position, position_raw, flags)

    return (matrix, position, flags)
def preprocessFile(path, basePath, retainLine):
    """
    Perform preprocessing on the lexer.

    Parameters:
    -----------
    path
    basePath
    retainLine - do we keep the original line numbers or not

    Returns:
    -----------
    (curProject - The current project or corpora we are in
     curFile - The corresponding original file path
     lexedWoComments - the Pygments token list with preprocessing OR
                       (Not yet implemented) Something for English?,
     language - the language of this lexer
     fileErrorCount - count of observed error tokens from Pygments)
    """
    if (True):  # TODO is a programming language.
        components = path.split(".")
        fileContents = ""
        fileContents = ''.join(open(path, 'r').readlines())

        lexer = get_lexer_for_filename(path)
        tokens = lex(fileContents, lexer)  # returns a generator of tuples
        tokensList = list(tokens)
        language = languageForLexer(lexer)
        (curProject, curFile) = getProjectAndFilename(path, basePath)

        # Debug: what does the original token set look like
        # print(tokensList)
        # quit()

        if (retainLine):
            lexedWoComments = reduceToNewLine(tokensList, Token.Comment)
            lexedWoComments = reduceToNewLine(lexedWoComments, Token.Literal.String.Doc)
        else:
            # Strip comments and alter strings
            lexedWoComments = tokensExceptTokenType(tokensList, Token.Comment)
            lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Literal.String.Doc)

        beforeError = len(lexedWoComments)
        # Remove things that didn't lex properly
        lexedWoComments = tokensExceptTokenType(lexedWoComments, Token.Error)
        fileErrorCount = beforeError - len(lexedWoComments)

        # Alter the pygments lexer types to be more comparable between our
        # languages
        lexedWoComments = fixTypes(lexedWoComments, language)
        lexedWoComments = convertNamespaceTokens(lexedWoComments, language)

        return (curProject, curFile, lexedWoComments, language, fileErrorCount)
def format_line(fname, lineno, line, color=True, lexer=None, formatter=None):
    """Formats a trace line suitable for printing."""
    fname = min(fname, replace_home(fname), os.path.relpath(fname), key=len)
    if not color:
        return COLORLESS_LINE.format(fname=fname, lineno=lineno, line=line)
    cline = COLOR_LINE.format(fname=fname, lineno=lineno)
    if not HAVE_PYGMENTS:
        return cline + line
    # OK, so we have pygments
    tokens = pyghooks.partial_color_tokenize(cline)
    lexer = lexer or pyghooks.XonshLexer()
    tokens += pygments.lex(line, lexer=lexer)
    return tokens
def lex(self):
    # Get lexer for language (use text as fallback)
    try:
        if self.language and str(self.language).lower() != 'none':
            lexer = get_lexer_by_name(self.language.lower(), **self.custom_args)
        else:
            lexer = get_lexer_by_name('text', **self.custom_args)
    except ValueError:
        log.info("no pygments lexer for %s, using 'text'" % self.language)
        # what happens if pygment isn't present ?
        lexer = get_lexer_by_name('text')
    return pygments.lex(self.code, lexer)
def lex(self):
    """Get lexer for language (use text as fallback)"""
    try:
        if self.language and unicode(self.language).lower() <> 'none':
            lexer = get_lexer_by_name(self.language.lower(), **self.custom_args)
        else:
            lexer = get_lexer_by_name('text', **self.custom_args)
    except ValueError:
        # what happens if pygment isn't present ?
        lexer = get_lexer_by_name('text')
    return pygments.lex(self.code, lexer)
def print_result(self, data=None):
    data = data or self.data
    if isinstance(data, dict):
        data = self._process_dict(data)
    elif isinstance(data, (list, tuple)):
        data = [v.decode('utf-8') if isinstance(v, bytes) else v for v in data]
    elif isinstance(data, bytes):
        data = data.decode('utf-8')
    tokens = list(pygments.lex(json.dumps(data, indent=4), lexer=JsonLexer()))
    print_formatted_text(PygmentsTokens(tokens))
def parse(self, start='1.0'):
    data = self.get(start, 'end')
    while data and '\n' == data[0]:
        start = self.index('%s+1c' % start)
        data = data[1:]
    self.mark_set('range_start', start)
    for t in self._syntax_highlighting_tags:
        self.tag_remove(t, start, "range_start +%ic" % len(data))
    for token, content in lex(data, Python3Lexer()):
        self.mark_set("range_end", "range_start + %ic" % len(content))
        for t in token.split():
            self.tag_add(str(t), "range_start", "range_end")
        self.mark_set("range_start", "range_end")
def display_bibs(labels, bibs):
    r"""
    Display a list of bib entries on screen with flying colors.

    Parameters
    ----------
    labels: List of Strings
        Header labels to show above each Bib() entry.
    bibs: List of Bib() objects
        BibTeX entries to display.

    Examples
    --------
    >>> import bibmanager.bib_manager as bm
    >>> e1 = '''@Misc{JonesEtal2001scipy,
           author = {Eric Jones and Travis Oliphant and Pearu Peterson},
           title  = {{SciPy}: Open source scientific tools for {Python}},
           year   = {2001},
         }'''
    >>> e2 = '''@Misc{Jones2001,
           author = {Eric Jones and Travis Oliphant and Pearu Peterson},
           title  = {SciPy: Open source scientific tools for Python},
           year   = {2001},
         }'''
    >>> bibs = [bm.Bib(e1), bm.Bib(e2)]
    >>> bm.display_bibs(["DATABASE:\n", "NEW:\n"], bibs)
    ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
    DATABASE:
    @Misc{JonesEtal2001scipy,
           author = {Eric Jones and Travis Oliphant and Pearu Peterson},
           title  = {{SciPy}: Open source scientific tools for {Python}},
           year   = {2001},
         }

    NEW:
    @Misc{Jones2001,
           author = {Eric Jones and Travis Oliphant and Pearu Peterson},
           title  = {SciPy: Open source scientific tools for Python},
           year   = {2001},
         }
    """
    style = prompt_toolkit.styles.style_from_pygments_cls(
        pygments.styles.get_style_by_name(cm.get('style')))
    if labels is None:
        labels = ["" for _ in bibs]
    tokens = [(Token.Comment, u.BANNER)]
    for label, bib in zip(labels, bibs):
        tokens += [(Token.Text, label)]
        tokens += list(pygments.lex(bib.content, lexer=BibTeXLexer()))
        tokens += [(Token.Text, "\n")]

    print_formatted_text(PygmentsTokens(tokens), end="", style=style)
def __iter__(self):
    """Parse self.code and yield "classified" tokens.
    """
    if self.lexer is None:
        yield ([], self.code)
        return
    tokens = pygments.lex(self.code, self.lexer)
    for tokentype, value in self.merge(tokens):
        if self.tokennames == 'long':  # long CSS class args
            classes = str(tokentype).lower().split('.')
        else:  # short CSS class args
            classes = [_get_ttype_class(tokentype)]
        classes = [cls for cls in classes if cls not in unstyled_tokens]
        yield (classes, value)
def print_packets(path: list, nodes: dict) -> None:
    tokens = []
    for e in path[:-1]:
        node = nodes[e.dst]
        p = node.render()
        line = '{} = {}'.format(node.name.replace('-', '_'), repr(p))
        tokens.extend(list(pygments.lex(line, lexer=Python3Lexer())))

    # p = self.fuzz_node.render()
    node = nodes[path[-1].dst]
    p = node.render()
    line = '{} = {}'.format(node.name.replace('-', '_'), repr(p))
    print(pygments.highlight(line, Python3Lexer(), Terminal256Formatter(style='rrt')))
def _findOneToken(tokenToFind, code):
    """ Find a token in the code and returns a list of lines where it was found
    """
    lexer = PythonLexer()
    tokens = pygments.lex(code, lexer)

    lineNumber = 1
    whereFound = []
    for token in tokens:
        lineNumber += _getNewLines(token)
        if _sameToken(token, tokenToFind):
            whereFound.append(lineNumber)
    return whereFound
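_getNewLines and _sameToken are project helpers not shown here; the same line tracking can be sketched with plain pygments by counting newlines in each token's text (illustrative only).

# Minimal line-tracking sketch: report the line number of every 'def' keyword.
import pygments
from pygments.lexers import PythonLexer
from pygments.token import Token

def find_token_lines(code, wanted_type, wanted_text):
    line_number = 1
    found = []
    for ttype, text in pygments.lex(code, PythonLexer()):
        if ttype is wanted_type and text == wanted_text:
            found.append(line_number)
        line_number += text.count('\n')
    return found

print(find_token_lines("x = 1\n\ndef foo():\n    pass\n", Token.Keyword, 'def'))
# -> [3]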
def get_tokens(file: str, lang: str) -> Counter:
    """
    Gather a Counter object of tokens in the file and their count.
    :param file: the path to the file.
    :param lang: the language of file.
    :return: a Counter object of items: token and count.
    """
    content = PygmentsParser.read_file(file)
    tokens = []
    for pair in pygments.lex(content, PygmentsParser.LEXERS[lang]):
        if any(pair[0] in sublist for sublist in PygmentsParser.TYPES[lang]):
            tokens.extend(list(Subtokenizer.process_token(pair[1])))
    return Counter(tokens)
def convert_text(text: str, extension: str) -> List[ParsedToken]:
    extension = extension or 'java'
    if extension:
        try:
            lexer = get_lexer_by_name(extension)
        except ClassNotFound as err:
            logger.warning(err)
            lexer = guess_lexer(text)
    else:
        lexer = guess_lexer(text)

    for token, value in lex(text, lexer):
        model_tokens = _convert(token, value)
        for mr in model_tokens:
            yield mr
def parse_string(self, s):
    """
    Parse string using lexer, if none exists return string with
    default text color
    """
    start = 0
    ret_list = []
    if self.lexer is None:
        return ([(0, s, options['text_color'])])
    for token in lex(s, self.lexer):
        color = self.determine_color(token[0])
        ret_list.append((start, token[1], color))
        start += len(token[1])
    return ret_list
def highlight2(self, event=None):
    """Highlight the syntax of the current line"""
    text_widget = self.get_current()
    row = text_widget.index('insert').split('.')[0]
    self.remove_tags2(1)
    content = text_widget.get("1.0", 'end')
    # lines = content.split("\n")
    text_widget.mark_set("range_start", "1" + ".0")
    data = text_widget.get("1.0", "end")
    for token, content in lex(data, Python3Lexer()):
        text_widget.mark_set("range_end", "range_start + %dc" % len(content))
        text_widget.tag_add(str(token), "range_start", "range_end")
        text_widget.mark_set("range_start", "range_end")
    self.tag_conf()
def code_token(code_diff):
    # print(code_diff)
    # print('---------------------------------------')
    code_diff = code_prepare(code_diff)
    # print(code_diff)
    # print('==========================================')
    # print(lexers.guess_lexer(code_diff))
    lexer = lexers.get_lexer_by_name("java", stripall=True)
    tokens = list(pygments.lex(code_diff, lexer))
    # tokens = list(javalang.tokenizer.tokenize(code_diff))
    tokens_list = []
    for token in tokens:
        if str(token[0]) != 'Token.Text' and str(token[0]) != 'Token.Punctuation':
            tokens_list.append(token[1].lower())
    return tokens_list
def findDecorators(code):
    """ Look for decorators @
    Documentation: Python Pocket Reference page 67
    """
    decorators = PythonIdiom('decorator')
    lexer = PythonLexer()
    tokens = pygments.lex(code, lexer)

    lineNumber = 1
    for ttype, word in tokens:
        lineNumber += _getNewLines((ttype, word))
        if ttype is Token.Name.Decorator:
            decorators.addNew(lineNumber)
    log("Decorators found in lines: " + str(decorators.getLines()))
    return decorators
def python_lexer(self):
    for tag in self.text_area.tag_names():
        self.text_area.tag_delete(tag)
    self._set_text_tags()
    data = self.text_area.get("1.0", "end-1c")
    self.text_area.mark_set("range_start", "1.0")
    print("------------------")
    for token, content in lex(data, PythonLexer()):
        master_token = ".".join(str(token).split(".")[0:2])
        self.text_area.mark_set("range_end", "range_start + %dc" % len(content))
        self.text_area.tag_add(str(master_token), "range_start", "range_end")
        print(token, len(content), content.encode())
        self.text_area.mark_set("range_start", "range_end")
def default_highlight(self):
    row = float(self.text.index(tk.INSERT))
    row = str(math.trunc(row))
    content = self.text.get("1.0", tk.END)
    lines = content.split("\n")

    if (self.previousContent != content):
        self.text.mark_set("range_start", row + ".0")
        data = self.text.get(row + ".0", row + "." + str(len(lines[int(row) - 1])))
        for token, content in lex(data, self.lexer):
            self.text.mark_set("range_end", "range_start + %dc" % len(content))
            self.text.tag_add(str(token), "range_start", "range_end")
            self.text.mark_set("range_start", "range_end")
        self.previousContent = self.text.get("1.0", tk.END)
def run(self):
    """
    Lexes the data to see what lexers can tokenize it. Any successful
    lexers are considered possible matches.
    """
    bad_tokens = (Token.Text, Token.Name, Token.Name.Other)
    tokens = [
        tok for tok, text in lex(self.data_string, self.lexer)
        if tok not in bad_tokens and text != ''
    ]
    token_count = len(tokens)

    # Errors mean we definitely didn't find the right language
    if Token.Error in tokens or token_count == 0:
        self.result = False
    else:
        self.result = token_count
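A hedged sketch of how such a scoring pass might be driven over several candidate lexers, using only public pygments calls (the class around run is not shown, so this standalone version is illustrative only).

# Score candidate lexers by how many "meaningful" tokens they produce and
# whether they emit any error tokens; False means the lexer is a poor fit.
import pygments
from pygments.lexers import get_lexer_by_name
from pygments.token import Token

def score_lexer(data_string, lexer_name):
    bad_tokens = (Token.Text, Token.Name, Token.Name.Other)
    lexer = get_lexer_by_name(lexer_name)
    tokens = [tok for tok, text in pygments.lex(data_string, lexer)
              if tok not in bad_tokens and text != '']
    if Token.Error in tokens or not tokens:
        return False
    return len(tokens)

sample = "for i in range(3):\n    print(i)\n"
print({name: score_lexer(sample, name) for name in ('python', 'json', 'c')})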