def _colorize_re_flags(self, flags, state):
    if flags:
        flags = [c for (c, n) in sorted(sre_parse.FLAGS.items())
                 if (n & flags)]
        flags = six.b('(?%s)') % six.b(''.join(flags))
        self._output(flags, self.RE_FLAGS_TAG, state)
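# Illustrative sketch (not part of the original module): how a flags
# bitmask becomes an inline "(?...)" prefix, using the same
# undocumented sre_parse.FLAGS table that _colorize_re_flags relies
# on.  The helper name is hypothetical.
def _example_flags_prefix():
    import re
    import sre_parse
    flags = re.IGNORECASE | re.MULTILINE
    chars = [c for (c, n) in sorted(sre_parse.FLAGS.items()) if n & flags]
    return '(?%s)' % ''.join(chars)   # -> '(?im)'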
def _tigetstr(self, cap_name):
    # String capabilities can include "delays" of the form "$<2>".
    # For any modern terminal, we should be able to just ignore
    # these, so strip them out.
    import curses
    cap = curses.tigetstr(cap_name) or six.b('')
    cap = re.sub(six.b(r'\$<\d+>[/*]?'), six.b(''), cap)
    if six.binary_type is not str:
        cap = cap.decode('ascii')
    return cap
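# Illustrative sketch (not part of the original module): the "$<N>"
# padding/delay markers that terminfo capabilities may contain, and
# how the substitution above removes them.  The capability bytes here
# are made up for the example.
def _example_strip_delays():
    import re
    cap = b'\x1b[H\x1b[2J$<50>'                  # hypothetical capability with a delay
    return re.sub(br'\$<\d+>[/*]?', b'', cap)    # -> b'\x1b[H\x1b[2J'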
def _colorize_dict(self, items, state, prefix, suffix):
    self._output(prefix, self.GROUP_TAG, state)
    indent = state.charpos
    for i, (key, val) in enumerate(items):
        if i >= 1:
            if state.linebreakok:
                self._output(six.b(','), self.COMMA_TAG, state)
                self._output(six.b('\n') + six.b(' ') * indent, None, state)
            else:
                self._output(six.b(', '), self.COMMA_TAG, state)
        self._colorize(key, state)
        self._output(six.b(': '), self.COLON_TAG, state)
        self._colorize(val, state)
    self._output(suffix, self.GROUP_TAG, state)
def _colorize_re(self, pyval, state):
    # Extract the flags & pattern from the regexp.
    pat, flags = pyval.pattern, pyval.flags
    # If the pattern is a string, decode it to unicode.
    ##if isinstance(pat, six.binary_type):
    ##    pat = decode_with_backslashreplace(pat)
    # Parse the regexp pattern.
    tree = sre_parse.parse(pat, flags)
    groups = dict([(num, name)
                   for (name, num) in tree.pattern.groupdict.items()])
    # Colorize it!
    self._output(six.b("re.compile(r'"), None, state)
    self._colorize_re_flags(flags, state)
    self._colorize_re_tree(tree, state, True, groups)
    self._output(six.b("')"), None, state)
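# Illustrative sketch (not part of the original module): the parse
# tree that _colorize_re_tree walks mirrors the structure the re
# module itself builds.  Compiling with re.DEBUG dumps that structure
# (SUBPATTERN, LITERAL, MAX_REPEAT, IN/CATEGORY nodes for the pattern
# below), which is a convenient way to see what the colorizer visits.
def _example_dump_re_parse_tree():
    import re
    return re.compile(r'(?P<word>\w+)-\d*', re.DEBUG)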
def _colorize(self, pyval, state):
    pyval_type = type(pyval)
    state.score += 1
    if pyval is None or pyval is True or pyval is False:
        self._output(six.text_type(pyval), self.CONST_TAG, state)
    elif pyval_type in six.integer_types + (float, complex):
        self._output(six.text_type(pyval), self.NUMBER_TAG, state)
    elif pyval_type is str:
        self._colorize_str(pyval, state, '', self._str_escape)
    elif pyval_type is six.binary_type:
        self._colorize_str(pyval, state, six.b('b'), self._bytes_escape)
    elif pyval_type is six.text_type:
        self._colorize_str(pyval, state, six.u('u'), self._unicode_escape)
    elif pyval_type is list:
        self._multiline(self._colorize_iter, pyval, state, '[', ']')
    elif pyval_type is tuple:
        self._multiline(self._colorize_iter, pyval, state, '(', ')')
    elif pyval_type is set:
        self._multiline(self._colorize_iter, self._sort(pyval),
                        state, 'set([', '])')
    elif pyval_type is frozenset:
        self._multiline(self._colorize_iter, self._sort(pyval),
                        state, 'frozenset([', '])')
    elif pyval_type is dict:
        self._multiline(self._colorize_dict,
                        self._sort(list(pyval.items())),
                        state, '{', '}')
    elif is_re_pattern(pyval):
        self._colorize_re(pyval, state)
    else:
        try:
            pyval_repr = repr(pyval)
            if not isinstance(pyval_repr, str):
                pyval_repr = str(pyval_repr)
        except KeyboardInterrupt:
            raise
        except:
            state.score -= 100
            state.result.append(self.UNKNOWN_REPR)
        else:
            if self.GENERIC_OBJECT_RE.match(pyval_repr):
                state.score -= 5
            self._output(pyval_repr, None, state)
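# Illustrative sketch (not part of the original module): the same
# type-based dispatch idea as _colorize above, reduced to returning a
# tag name instead of emitting colorized output.  The function name
# and tag labels are hypothetical, not the real *_TAG constants.
def _example_value_tag(pyval):
    if pyval is None or pyval is True or pyval is False:
        return 'const'
    if isinstance(pyval, (int, float, complex)):
        return 'number'
    if isinstance(pyval, (str, bytes)):
        return 'string'
    if isinstance(pyval, (list, tuple, set, frozenset, dict)):
        return 'container'
    return 'other'

# e.g. _example_value_tag(3.14) -> 'number'; _example_value_tag({'a': 1}) -> 'container'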
def _colorize_re_tree(self, tree, state, noparen, groups):
    assert noparen in (True, False)
    try:
        if len(tree) > 1 and not noparen:
            self._output(six.b('('), self.RE_GROUP_TAG, state)
    except TypeError:
        print("tree: %r" % tree)
        raise
    for elt in tree:
        op = elt[0]
        args = elt[1]

        if op == sre_constants.LITERAL:
            c = six.unichr(args)
            # Add any appropriate escaping.
            if c in six.u('.^$\\*+?{}[]|()\''):
                c = six.b('\\') + six.b(c)
            elif c == six.u('\t'):
                c = six.b('\\t')
            elif c == six.u('\r'):
                c = six.b('\\r')
            elif c == six.u('\n'):
                c = six.b('\\n')
            elif c == six.u('\f'):
                c = six.b('\\f')
            elif c == six.u('\v'):
                c = six.b('\\v')
            elif ord(c) > 0xffff:
                c = six.b(r'\U%08x') % ord(c)
            elif ord(c) > 0xff:
                c = six.b(r'\u%04x') % ord(c)
            elif ord(c) < 32 or ord(c) >= 127:
                c = six.b(r'\x%02x') % ord(c)
            self._output(c, self.RE_CHAR_TAG, state)

        elif op == sre_constants.ANY:
            self._output(six.b('.'), self.RE_CHAR_TAG, state)

        elif op == sre_constants.BRANCH:
            if args[0] is not None:
                raise ValueError('Branch expected None arg but got %s'
                                 % args[0])
            for i, item in enumerate(args[1]):
                if i > 0:
                    self._output(six.b('|'), self.RE_OP_TAG, state)
                self._colorize_re_tree(item, state, True, groups)

        elif op == sre_constants.IN:
            if (len(args) == 1 and args[0][0] == sre_constants.CATEGORY):
                self._colorize_re_tree(args, state, False, groups)
            else:
                self._output(six.b('['), self.RE_GROUP_TAG, state)
                self._colorize_re_tree(args, state, True, groups)
                self._output(six.b(']'), self.RE_GROUP_TAG, state)

        elif op == sre_constants.CATEGORY:
            if args == sre_constants.CATEGORY_DIGIT:
                val = six.b(r'\d')
            elif args == sre_constants.CATEGORY_NOT_DIGIT:
                val = six.b(r'\D')
            elif args == sre_constants.CATEGORY_SPACE:
                val = six.b(r'\s')
            elif args == sre_constants.CATEGORY_NOT_SPACE:
                val = six.b(r'\S')
            elif args == sre_constants.CATEGORY_WORD:
                val = six.b(r'\w')
            elif args == sre_constants.CATEGORY_NOT_WORD:
                val = six.b(r'\W')
            else:
                raise ValueError('Unknown category %s' % args)
            self._output(val, self.RE_CHAR_TAG, state)

        elif op == sre_constants.AT:
            if args == sre_constants.AT_BEGINNING_STRING:
                val = six.b(r'\A')
            elif args == sre_constants.AT_BEGINNING:
                val = six.b(r'^')
            elif args == sre_constants.AT_END:
                val = six.b(r'$')
            elif args == sre_constants.AT_BOUNDARY:
                val = six.b(r'\b')
            elif args == sre_constants.AT_NON_BOUNDARY:
                val = six.b(r'\B')
            elif args == sre_constants.AT_END_STRING:
                val = six.b(r'\Z')
            else:
                raise ValueError('Unknown position %s' % args)
            self._output(val, self.RE_CHAR_TAG, state)

        elif op in (sre_constants.MAX_REPEAT, sre_constants.MIN_REPEAT):
            minrpt = args[0]
            maxrpt = args[1]
            if maxrpt == sre_constants.MAXREPEAT:
                if minrpt == 0:
                    val = six.b('*')
                elif minrpt == 1:
                    val = six.b('+')
                else:
                    val = six.b('{%d,}') % (minrpt)
            elif minrpt == 0:
                if maxrpt == 1:
                    val = six.b('?')
                else:
                    val = six.b('{,%d}') % (maxrpt)
            elif minrpt == maxrpt:
                val = six.b('{%d}') % (maxrpt)
            else:
                val = six.b('{%d,%d}') % (minrpt, maxrpt)
            if op == sre_constants.MIN_REPEAT:
                val += six.b('?')
            self._colorize_re_tree(args[2], state, False, groups)
            self._output(val, self.RE_OP_TAG, state)

        elif op == sre_constants.SUBPATTERN:
            if args[0] is None:
                self._output(six.b('(?:'), self.RE_GROUP_TAG, state)
            elif args[0] in groups:
                self._output(six.b('(?P<'), self.RE_GROUP_TAG, state)
                self._output(groups[args[0]], self.RE_REF_TAG, state)
                self._output(six.b('>'), self.RE_GROUP_TAG, state)
            elif isinstance(args[0], six.integer_types):
                # This is cheating:
                self._output(six.b('('), self.RE_GROUP_TAG, state)
            else:
                self._output(six.b('(?P<'), self.RE_GROUP_TAG, state)
                self._output(args[0], self.RE_REF_TAG, state)
                self._output(six.b('>'), self.RE_GROUP_TAG, state)
            if six.PY2:
                self._colorize_re_tree(args[1], state, True, groups)
            else:
                self._colorize_re_tree(args[3], state, True, groups)
            self._output(six.b(')'), self.RE_GROUP_TAG, state)

        elif op == sre_constants.GROUPREF:
            self._output(six.b('\\%d') % args, self.RE_REF_TAG, state)

        elif op == sre_constants.RANGE:
            self._colorize_re_tree(((sre_constants.LITERAL, args[0]),),
                                   state, False, groups)
            self._output(six.b('-'), self.RE_OP_TAG, state)
            self._colorize_re_tree(((sre_constants.LITERAL, args[1]),),
                                   state, False, groups)

        elif op == sre_constants.NEGATE:
            self._output(six.b('^'), self.RE_OP_TAG, state)

        elif op == sre_constants.ASSERT:
            if args[0] > 0:
                self._output(six.b('(?='), self.RE_GROUP_TAG, state)
            else:
                self._output(six.b('(?<='), self.RE_GROUP_TAG, state)
            self._colorize_re_tree(args[1], state, True, groups)
            self._output(six.b(')'), self.RE_GROUP_TAG, state)

        elif op == sre_constants.ASSERT_NOT:
            if args[0] > 0:
                self._output(six.b('(?!'), self.RE_GROUP_TAG, state)
            else:
                self._output(six.b('(?<!'), self.RE_GROUP_TAG, state)
            self._colorize_re_tree(args[1], state, True, groups)
            self._output(six.b(')'), self.RE_GROUP_TAG, state)

        elif op == sre_constants.NOT_LITERAL:
            self._output(six.b('[^'), self.RE_GROUP_TAG, state)
            self._colorize_re_tree(((sre_constants.LITERAL, args),),
                                   state, False, groups)
            self._output(six.b(']'), self.RE_GROUP_TAG, state)

        else:
            log.error("Error colorizing regexp: unknown elt %r" % elt)

    if len(tree) > 1 and not noparen:
        self._output(six.b(')'), self.RE_GROUP_TAG, state)
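# Illustrative sketch (not part of the original module): the
# MAX_REPEAT/MIN_REPEAT branch above picks a repetition suffix from
# the (min, max) pair.  This standalone helper (hypothetical name)
# shows the same mapping, with sys.maxsize standing in for sre's
# MAXREPEAT sentinel.
def _example_repeat_suffix(minrpt, maxrpt, nongreedy=False):
    import sys
    if maxrpt >= sys.maxsize:          # treat as "unbounded"
        val = {0: '*', 1: '+'}.get(minrpt, '{%d,}' % minrpt)
    elif minrpt == 0:
        val = '?' if maxrpt == 1 else '{,%d}' % maxrpt
    elif minrpt == maxrpt:
        val = '{%d}' % maxrpt
    else:
        val = '{%d,%d}' % (minrpt, maxrpt)
    return val + ('?' if nongreedy else '')

# e.g. _example_repeat_suffix(0, 1) -> '?'; _example_repeat_suffix(2, 5, True) -> '{2,5}?'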
class PythonSourceColorizer: """ A class that renders a python module's source code into HTML pages. These HTML pages are intended to be provided along with the API documentation for a module, in case a user wants to learn more about a particular object by examining its source code. Links are therefore generated from the API documentation to the source code pages, and from the source code pages back into the API documentation. The HTML generated by C{PythonSourceColorizer} has several notable features: - CSS styles are used to color tokens according to their type. (See L{CSS_CLASSES} for a list of the different token types that are identified). - Line numbers are included to the left of each line. - The first line of each class and function definition includes a link to the API source documentation for that object. - The first line of each class and function definition includes an anchor that can be used to link directly to that class or function. - If javascript is enabled, and the page is loaded using the anchor for a class or function (i.e., if the url ends in C{'#I{<name>}'}), then that class or function will automatically be highlighted; and all other classes and function definition blocks will be 'collapsed'. These collapsed blocks can be expanded by clicking on them. - Unicode input is supported (including automatic detection of C{'coding:'} declarations). """ #: A look-up table that is used to determine which CSS class #: should be used to colorize a given token. The following keys #: may be used: #: - Any token name (e.g., C{'STRING'}) #: - Any operator token (e.g., C{'='} or C{'@'}). #: - C{'KEYWORD'} -- Python keywords such as C{'for'} and C{'if'} #: - C{'DEFNAME'} -- the name of a class or function at the top #: of its definition statement. #: - C{'BASECLASS'} -- names of base classes at the top of a class #: definition statement. #: - C{'PARAM'} -- function parameters #: - C{'DOCSTRING'} -- docstrings #: - C{'DECORATOR'} -- decorator names #: If no CSS class can be found for a given token, then it won't #: be marked with any CSS class. CSS_CLASSES = { 'NUMBER': 'py-number', 'STRING': 'py-string', 'COMMENT': 'py-comment', 'NAME': 'py-name', 'KEYWORD': 'py-keyword', 'DEFNAME': 'py-def-name', 'BASECLASS': 'py-base-class', 'PARAM': 'py-param', 'DOCSTRING': 'py-docstring', 'DECORATOR': 'py-decorator', 'OP': 'py-op', '@': 'py-decorator', } #: HTML code for the beginning of a collapsable function or class #: definition block. The block contains two <div>...</div> #: elements -- a collapsed version and an expanded version -- and #: only one of these elements is visible at any given time. By #: default, all definition blocks are expanded. #: #: This string should be interpolated with the following values:: #: (name, indentation, name) #: Where C{name} is the anchor name for the function or class; and #: indentation is a string of whitespace used to indent the #: ellipsis marker in the collapsed version. START_DEF_BLOCK = ('<div id="%s-collapsed" style="display:none;" ' 'pad="%s" indent="%s"></div>' '<div id="%s-expanded">') #: HTML code for the end of a collapsable function or class #: definition block. END_DEF_BLOCK = '</div>' #: A regular expression used to pick out the unicode encoding for #: the source file. UNICODE_CODING_RE = re.compile(six.b(r'.*?\n?.*?coding[:=]\s*([-\w.]+)')) #: A configuration constant, used to determine whether or not to add #: collapsable <div> elements for definition blocks. 
ADD_DEF_BLOCKS = True #: A configuration constant, used to determine whether or not to #: add line numbers. ADD_LINE_NUMBERS = True #: A configuration constant, used to determine whether or not to #: add tooltips for linked names. ADD_TOOLTIPS = True #: If true, then try to guess which target is appropriate for #: linked names; if false, then always open a div asking the #: user which one they want. GUESS_LINK_TARGETS = False def __init__(self, module_filename, module_name, docindex=None, url_func=None, name_to_docs=None, tab_width=8): """ Create a new HTML colorizer for the specified module. @param module_filename: The name of the file containing the module; its text will be loaded from this file. @param module_name: The dotted name of the module; this will be used to create links back into the API source documentation. """ # Get the source version, if possible. try: module_filename = py_src_filename(module_filename) except: pass #: The filename of the module we're colorizing. self.module_filename = module_filename #: The dotted name of the module we're colorizing. self.module_name = module_name #: A docindex, used to create href links from identifiers to #: the API documentation for their values. self.docindex = docindex #: A mapping from short names to lists of ValueDoc, used to #: decide which values an identifier might map to when creating #: href links from identifiers to the API docs for their values. self.name_to_docs = name_to_docs #: A function that maps APIDoc -> URL, used to create href #: links from identifiers to the API documentation for their #: values. self.url_func = url_func #: Encoding of input text self.coding = None #: The index in C{text} of the last character of the last #: token we've processed. self.input_pos = 0 #: The index in tokenizer output stream of the last character #: of the last token we've processed. self.token_pos = 0 #: A list that maps line numbers to character offsets in #: C{text}. In particular, line C{M{i}} begins at character #: C{line_offset[i]} in C{text}. Since line numbers begin at #: 1, the first element of C{token_line_offsets} is C{None}. self.input_line_offsets = [] #: A list that maps line numbers to character offsets in #: an output text from tokenizer. These values are consistent #: with line/column counts returned from tokenizer (which in #: python3 refer to the decoded string). self.token_line_offsets = [] #: A list of C{(toktype, toktext)} for all tokens on the #: logical line that we are currently processing. Once a #: complete line of tokens has been collected in C{cur_line}, #: it is sent to L{handle_line} for processing. self.cur_line = [] #: A list of the names of the class or functions that include #: the current block. C{context} has one element for each #: level of indentation; C{context[i]} is the name of the class #: or function defined by the C{i}th level of indentation, or #: C{None} if that level of indentation doesn't correspond to a #: class or function definition. self.context = [] #: A list, corresponding one-to-one with L{self.context}, #: indicating the type of each entry. Each element of #: C{context_types} is one of: C{'func'}, C{'class'}, C{None}. self.context_types = [] #: A list of indentation strings for each of the current #: block's indents. I.e., the current total indentation can #: be found by taking C{''.join(self.indents)}. self.indents = [] #: The line number of the line we're currently processing. 
self.lineno = 0 #: The name of the class or function whose definition started #: on the previous logical line, or C{None} if the previous #: logical line was not a class or function definition. self.def_name = None #: The type of the class or function whose definition started #: on the previous logical line, or C{None} if the previous #: logical line was not a class or function definition. #: Can be C{'func'}, C{'class'}, C{None}. self.def_type = None #: The number of spaces to replace each tab in source code with self.tab_width = tab_width def find_line_offsets(self): """ Construct the L{token_line_offsets} table from C{self.text}. """ # line 0 doesn't exist; line 1 starts at char offset 0. self.token_line_offsets = [None, 0] self.input_line_offsets = [None, 0] # Find all newlines in `text`, and add an entry to # token_line_offsets for each one. total = 0 bprev = 0 bnext = self.text.find(six.b('\n')) + 1 while bnext > 0: line = self.text[bprev:bnext] # includes \n if six.binary_type is not str: line = line.decode(self.coding) linelen = len(line) total += linelen pos = self.token_line_offsets[-1] + linelen self.token_line_offsets.append(pos) self.input_line_offsets.append(bnext) bprev = bnext bnext = self.text.find(six.b('\n'), bnext) + 1 tail = self.text[bprev:] if six.binary_type is not str: tail = tail.decode(self.coding) total += len(tail) # Add a final entry, marking the end of the string. self.token_line_offsets.append(total) self.input_line_offsets.append(len(self.text)) def lineno_to_html(self): template = '%%%dd' % self.linenum_size n = template % self.lineno return '<a name="L%d"></a><tt class="py-lineno">%s</tt>' \ % (self.lineno, n) def colorize(self): """ Return an HTML string that renders the source code for the module that was specified in the constructor. """ # Initialize all our state variables self.token_pos = 0 self.input_pos = 0 self.cur_line = [] self.context = [] self.context_types = [] self.indents = [] self.lineno = 1 self.def_name = None self.def_type = None self.has_decorators = False # Cache, used so we only need to list the target elements once # for each variable. self.doclink_targets_cache = {} # Load the module's text. self.text = open(self.module_filename, 'rb').read() self.text = self.text.expandtabs(self.tab_width).rstrip() + six.b('\n') # Determine encoding. if six.PY2: do_tokenize = tokenize.tokenize m = self.UNICODE_CODING_RE.match(self.text) if m: self.coding = m.group(1) else: self.coding = 'iso-8859-1' else: coding, _ = tokenize.detect_encoding( six.BytesIO(self.text).readline) if coding.lower() == 'utf-8-sig': coding = 'utf-8' self.coding = coding def do_tokenize(readfcn, tokeneater): for tok in tokenize.tokenize(readfcn): self.tokeneater(*tok) if self.coding is None: raise ValueError("coding is None: %s" % repr(self.text)) # Construct the token_line_offsets table. self.find_line_offsets() num_lines = self.text.count(six.b('\n')) + 1 self.linenum_size = len(repr(num_lines + 1)) output = six.StringIO() self.out = output.write if six.binary_type is not str: readline = six.BytesIO(self.text).readline else: readline = six.StringIO(self.text).readline # Call the tokenizer, and send tokens to our `tokeneater()` # method. If anything goes wrong, then fall-back to using # the input text as-is (with no colorization). try: do_tokenize(readline, self.tokeneater) except tokenize.TokenError as ex: html = self.text else: html = output.getvalue() if self.has_decorators: html = self._FIX_DECORATOR_RE.sub(r'\2\1', html) # Check for a unicode encoding declaration. 
if isinstance(html, six.binary_type): # Decode the html string into unicode, and then encode it back # into ascii, replacing any non-ascii characters with xml # character references. try: html = html.decode(self.coding) except LookupError: coding = 'iso-8859-1' try: html = html.decode(coding) except UnicodeDecodeError as e: log.warning( "Unicode error while generating syntax-highlighted " "source code: %s (%s)" % (e, self.module_filename)) html = html.decode(coding, 'ignore') html = html.encode('ascii', 'xmlcharrefreplace') # Call expandto. html += PYSRC_EXPANDTO_JAVASCRIPT return html def tokeneater(self, toktype, toktext, srowcol, erowcol, line): """ A callback function used by C{tokenize.tokenize} to handle each token in the module. C{tokeneater} collects tokens into the C{self.cur_line} list until a complete logical line has been formed; and then calls L{handle_line} to process that line. """ srow, scol = srowcol erow, ecol = erowcol # If we encounter any errors, then just give up. if toktype == token.ERRORTOKEN: raise tokenize.TokenError(toktype) if hasattr(tokenize, 'ENCODING') and toktype == tokenize.ENCODING: if self.coding is None: self.coding = toktext return token_startpos = self.token_line_offsets[srow] + scol if six.binary_type is str: input_startpos = token_startpos input_toktext = toktext else: input_scol = len(line[:scol].encode(self.coding)) input_startpos = self.input_line_offsets[srow] + input_scol input_toktext = toktext.encode(self.coding) # Did we skip anything whitespace? If so, add a pseudotoken # for it, with toktype=None. (Note -- this skipped string # might also contain continuation slashes; but I won't bother # to colorize them.) if input_startpos > self.input_pos: skipped = self.text[self.input_pos:input_startpos] if six.binary_type is not str: skipped = skipped.decode(self.coding) self.cur_line.append((None, skipped)) # Update our position. self.token_pos = token_startpos + len(toktext) self.input_pos = input_startpos + len(input_toktext) # Update our current line. self.cur_line.append((toktype, toktext)) # When we reach the end of a line, process it. if toktype == token.NEWLINE or toktype == token.ENDMARKER: self.handle_line(self.cur_line) self.cur_line = [] _next_uid = 0 # [xx] note -- this works with byte strings, not unicode strings! # I may change it to use unicode eventually, but when I do it # needs to be changed all at once. def handle_line(self, line): """ Render a single logical line from the module, and write the generated HTML to C{self.out}. @param line: A single logical line, encoded as a list of C{(toktype,tokttext)} pairs corresponding to the tokens in the line. """ # def_name is the name of the function or class defined by # this line; or None if no funciton or class is defined. def_name = None # def_type is the type of the function or class defined by # this line; or None if no funciton or class is defined. def_type = None # does this line start a class/func def? starting_def_block = False in_base_list = False in_param_list = False in_param_default = 0 at_module_top = (self.lineno == 1) ended_def_blocks = 0 # The html output. if self.ADD_LINE_NUMBERS: s = self.lineno_to_html() self.lineno += 1 else: s = '' s += ' <tt class="py-line">' # Loop through each token, and colorize it appropriately. 
for i, (toktype, toktext) in enumerate(line): if type(s) is not str: if type(s) is six.text_type: # only PY2 -> unicode log.error('While colorizing %s -- got unexpected ' 'unicode string' % self.module_name) s = s.encode('ascii', 'xmlcharrefreplace') elif type(s) is six.binary_type: # only PY3 -> bytes log.error('While colorizing %s -- got unexpected ' 'binary string' % self.module_name) s = decode_with_backslashreplace(s) else: raise ValueError('Unexpected value for s -- %s' % type(s).__name__) # For each token, determine its css class and whether it # should link to a url. css_class = None url = None tooltip = None onclick = uid = targets = None # these 3 are used together. # Is this token the class name in a class definition? If # so, then make it a link back into the API docs. if i >= 2 and line[i - 2][1] == 'class': in_base_list = True css_class = self.CSS_CLASSES['DEFNAME'] def_name = toktext def_type = 'class' if 'func' not in self.context_types: cls_name = self.context_name(def_name) url = self.name2url(cls_name) s = self.mark_def(s, cls_name) starting_def_block = True # Is this token the function name in a function def? If # so, then make it a link back into the API docs. elif i >= 2 and line[i - 2][1] == 'def': in_param_list = True css_class = self.CSS_CLASSES['DEFNAME'] def_name = toktext def_type = 'func' if 'func' not in self.context_types: cls_name = self.context_name() func_name = self.context_name(def_name) url = self.name2url(cls_name, def_name) s = self.mark_def(s, func_name) starting_def_block = True # For each indent, update the indents list (which we use # to keep track of indentation strings) and the context # list. If this indent is the start of a class or # function def block, then self.def_name will be its name; # otherwise, it will be None. elif toktype == token.INDENT: self.indents.append(toktext) self.context.append(self.def_name) self.context_types.append(self.def_type) # When we dedent, pop the last elements off the indents # list and the context list. If the last context element # is a name, then we're ending a class or function def # block; so write an end-div tag. elif toktype == token.DEDENT: self.indents.pop() self.context_types.pop() if self.context.pop(): ended_def_blocks += 1 # If this token contains whitespace, then don't bother to # give it a css tag. elif toktype in (None, tokenize.NL, token.NEWLINE, token.ENDMARKER): css_class = None # Check if the token is a keyword. elif toktype == token.NAME and keyword.iskeyword(toktext): css_class = self.CSS_CLASSES['KEYWORD'] elif in_base_list and toktype == token.NAME: css_class = self.CSS_CLASSES['BASECLASS'] elif (in_param_list and toktype == token.NAME and not in_param_default): css_class = self.CSS_CLASSES['PARAM'] # Class/function docstring. elif (self.def_name and line[i - 1][0] == token.INDENT and self.is_docstring(line, i)): css_class = self.CSS_CLASSES['DOCSTRING'] # Module docstring. elif at_module_top and self.is_docstring(line, i): css_class = self.CSS_CLASSES['DOCSTRING'] # check for decorators?? elif (toktype == token.NAME and ( (i > 0 and line[i - 1][1] == '@') or (i > 1 and line[i - 1][0] == None and line[i - 2][1] == '@'))): css_class = self.CSS_CLASSES['DECORATOR'] self.has_decorators = True # If it's a name, try to link it. elif toktype == token.NAME: css_class = self.CSS_CLASSES['NAME'] # If we have a variable named `toktext` in the current # context, then link to that. Note that if we're inside # a function, then that function is our context, not # the namespace that contains it. 
[xx] this isn't always # the right thing to do. if (self.GUESS_LINK_TARGETS and self.docindex is not None and self.url_func is not None): context = [n for n in self.context if n is not None] container = self.docindex.get_vardoc( DottedName(self.module_name, *context)) if isinstance(container, NamespaceDoc): doc = container.variables.get(toktext) if doc is not None: url = self.url_func(doc) tooltip = str(doc.canonical_name) # Otherwise, check the name_to_docs index to see what # else this name might refer to. if (url is None and self.name_to_docs is not None and self.url_func is not None): docs = self.name_to_docs.get(toktext) if docs: tooltip = '\n'.join( [str(d.canonical_name) for d in docs]) if len(docs) == 1 and self.GUESS_LINK_TARGETS: url = self.url_func(docs[0]) else: uid, onclick, targets = self.doclink(toktext, docs) # For all other tokens, look up the CSS class to use # based on the token's type. else: if toktype == token.OP and toktext in self.CSS_CLASSES: css_class = self.CSS_CLASSES[toktext] elif token.tok_name[toktype] in self.CSS_CLASSES: css_class = self.CSS_CLASSES[token.tok_name[toktype]] else: css_class = None # update our status.. if toktext == ':': in_base_list = False in_param_list = False if toktext == '=' and in_param_list: in_param_default = True if in_param_default: if toktext in ('(', '[', '{'): in_param_default += 1 if toktext in (')', ']', '}'): in_param_default -= 1 if toktext == ',' and in_param_default == 1: in_param_default = 0 # Write this token, with appropriate colorization. if tooltip and self.ADD_TOOLTIPS: tooltip_html = ' title="%s"' % tooltip else: tooltip_html = '' if css_class: css_class_html = ' class="%s"' % css_class else: css_class_html = '' if onclick: if targets: targets_html = ' targets="%s"' % targets else: targets_html = '' s += ('<tt id="%s"%s%s><a%s%s href="#" onclick="%s">' % (uid, css_class_html, targets_html, tooltip_html, css_class_html, onclick)) elif url: if isinstance(url, six.text_type): url = url.encode('ascii', 'xmlcharrefreplace') s += ('<a%s%s href="%s">' % (tooltip_html, css_class_html, url)) elif css_class_html or tooltip_html: s += '<tt%s%s>' % (tooltip_html, css_class_html) if i == len(line) - 1: s += ' </tt>' # Closes <tt class="py-line"> s += cgi.escape(toktext) else: try: s += self.add_line_numbers(cgi.escape(toktext), css_class) except Exception as e: print((toktext, css_class, toktext.encode('ascii'))) raise if onclick: s += "</a></tt>" elif url: s += '</a>' elif css_class_html or tooltip_html: s += '</tt>' if self.ADD_DEF_BLOCKS: for i in range(ended_def_blocks): self.out(self.END_DEF_BLOCK) # Strip any empty <tt>s. s = re.sub(r'<tt class="[\w+]"></tt>', '', s) # Write the line. self.out(s) if def_name and starting_def_block: self.out('</div>') # Add div's if we're starting a def block. 
if (self.ADD_DEF_BLOCKS and def_name and starting_def_block and (line[-2][1] == ':')): indentation = (''.join(self.indents) + ' ').replace(' ', '+') linenum_padding = '+' * self.linenum_size name = self.context_name(def_name) self.out(self.START_DEF_BLOCK % (name, linenum_padding, indentation, name)) self.def_name = def_name self.def_type = def_type def context_name(self, extra=None): pieces = [n for n in self.context if n is not None] if extra is not None: pieces.append(extra) return '.'.join(pieces) def doclink(self, name, docs): uid = 'link-%s' % self._next_uid self._next_uid += 1 context = [n for n in self.context if n is not None] container = DottedName(self.module_name, *context) #else: # container = None targets = ','.join([ '%s=%s' % (str(self.doc_descr(d, container)), str(self.url_func(d))) for d in docs ]) if targets in self.doclink_targets_cache: onclick = ("return doclink('%s', '%s', '%s');" % (uid, name, self.doclink_targets_cache[targets])) return uid, onclick, None else: self.doclink_targets_cache[targets] = uid onclick = ("return doclink('%s', '%s', '%s');" % (uid, name, uid)) return uid, onclick, targets def doc_descr(self, doc, context): name = str(doc.canonical_name) descr = '%s %s' % (self.doc_kind(doc), name) if isinstance(doc, RoutineDoc): descr += '()' return descr # [XX] copied streight from html.py; this should be consolidated, # probably into apidoc. def doc_kind(self, doc): if isinstance(doc, ModuleDoc) and doc.is_package == True: return 'Package' elif (isinstance(doc, ModuleDoc) and doc.canonical_name[0].startswith('script')): return 'Script' elif isinstance(doc, ModuleDoc): return 'Module' elif isinstance(doc, ClassDoc): return 'Class' elif isinstance(doc, ClassMethodDoc): return 'Class Method' elif isinstance(doc, StaticMethodDoc): return 'Static Method' elif isinstance(doc, RoutineDoc): if (self.docindex is not None and isinstance(self.docindex.container(doc), ClassDoc)): return 'Method' else: return 'Function' else: return 'Variable' def mark_def(self, s, name): replacement = ('<a name="%s"></a><div id="%s-def">\\1' '<a class="py-toggle" href="#" id="%s-toggle" ' 'onclick="return toggle(\'%s\');">-</a>\\2' % (name, name, name, name)) return re.sub('(.*) (<tt class="py-line">.*)\Z', replacement, s) def is_docstring(self, line, i): if line[i][0] != token.STRING: return False for toktype, toktext in line[i:]: if toktype not in (token.NEWLINE, tokenize.COMMENT, tokenize.NL, token.STRING, None): return False return True def add_line_numbers(self, s, css_class): result = '' start = 0 end = s.find('\n') + 1 while end: result += s[start:end - 1] if css_class: result += '</tt>' result += ' </tt>' # py-line result += '\n' if self.ADD_LINE_NUMBERS: result += self.lineno_to_html() result += ' <tt class="py-line">' if css_class: result += '<tt class="%s">' % css_class start = end end = s.find('\n', end) + 1 self.lineno += 1 result += s[start:] return result def name2url(self, class_name, func_name=None): if class_name: class_name = '%s.%s' % (self.module_name, class_name) if func_name: return '%s-class.html#%s' % (class_name, func_name) else: return '%s-class.html' % class_name else: return '%s-module.html#%s' % (self.module_name, func_name) #: A regexp used to move the <div> that marks the beginning of a #: function or method to just before the decorators. 
_FIX_DECORATOR_RE = re.compile(
    r'((?:^<a name="L\d+"></a><tt class="py-lineno">\s*\d+</tt>'
    r'\s*<tt class="py-line">(?:<tt class="py-decorator">.*|\s*</tt>|'
    r'\s*<tt class="py-comment">.*)\n)+)'
    r'(<a name="\w+"></a><div id="\w+-def">)', re.MULTILINE)
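# Illustrative sketch (not part of the original module): colorize()
# applies _FIX_DECORATOR_RE with the replacement r'\2\1', i.e. it
# moves the "<div id=...-def>" marker (group 2) in front of the
# already-emitted decorator lines (group 1).  A minimal example of
# that group-swap technique, with made-up markup and a simpler
# pattern:
def _example_swap_groups():
    import re
    html = '<span class="decorator">@cached</span><div id="f-def">'
    return re.sub(r'(<span.*?</span>)(<div[^>]*>)', r'\2\1', html)
    # -> '<div id="f-def"><span class="decorator">@cached</span>'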