def __init__(self, source, modname, srcname, decoded=False):
    """Store the module identity and cache the full source text.

    :param source: seekable file-like object holding the source; it must
        yield bytes unless *decoded* is true
    :param modname: name of the module
    :param srcname: name of the source file
    :param decoded: true when *source* already yields text, in which case
        no encoding detection is done and ``self.encoding`` is ``None``
    """
    self.modname = modname
    self.srcname = srcname
    # file-like object yielding source lines
    self.source = source

    # Cache the complete source text as well.  The stream is rewound to the
    # offset it had on entry after every read.
    start = source.tell()
    if decoded:
        self.encoding = None
        self.code = source.read()
        source.seek(start)
    else:
        self.encoding = detect_encoding(source.readline)
        source.seek(start)
        self.code = source.read().decode(self.encoding)
        source.seek(start)
        # from here on the byte stream is read through a decoding wrapper
        self.source = TextIOWrapper(source, self.encoding)

    # filled by tokenize()
    self.tokens = None
    # filled by parse()
    self.parsetree = None
    # both filled by find_attr_docs()
    self.attr_docs = None
    self.tagorder = None
    # filled by find_tags()
    self.tags = None
def check():
    """Classify the link ``uri`` and return ``(status, info, code)``.

    ``uri``, ``self``, ``opener`` and ``kwargs`` are free names resolved
    from the surrounding scope.  The status is one of ``'unchecked'``,
    ``'local'``, ``'working'``, ``'broken'``, ``'redirected'`` or
    ``'ignored'``.  Failures of the network check are recorded in
    ``self.broken``.
    """
    # check for various conditions without bothering the network
    if not uri or uri.startswith(('#', 'mailto:', 'ftp:')):
        return 'unchecked', '', 0
    elif not uri.startswith(('http:', 'https:')):
        return 'local', '', 0
    elif uri in self.good:
        return 'working', '', 0
    elif uri in self.broken:
        return 'broken', self.broken[uri], 0
    elif uri in self.redirected:
        return ('redirected', self.redirected[uri][0],
                self.redirected[uri][1])
    for rex in self.to_ignore:
        if rex.match(uri):
            return 'ignored', '', 0

    # split off the anchor (named ``anchor`` so the builtin ``hash`` is
    # not shadowed)
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # need to actually check the URI
    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # release the response even when the anchor check raises
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except Exception as err:
        self.broken[uri] = str(err)
        return 'broken', str(err), 0
    # NOTE(review): nothing is returned after a successful request, so this
    # path yields None while every other path returns a 3-tuple -- confirm
    # whether the tail of this function (working/redirected handling using
    # ``f`` and ``req``) was lost.
class ModuleAnalyzer(object): # cache for analyzer objects -- caches both by module and file name cache = {} @classmethod def for_string(cls, string, modname, srcname='<string>'): if isinstance(string, bytes): return cls(BytesIO(string), modname, srcname) return cls(StringIO(string), modname, srcname, decoded=True) @classmethod def for_file(cls, filename, modname): if ('file', filename) in cls.cache: return cls.cache['file', filename] try: fileobj = open(filename, 'rb') except Exception as err: raise PycodeError('error opening %r' % filename, err) obj = cls(fileobj, modname, filename) cls.cache['file', filename] = obj return obj @classmethod def for_module(cls, modname): if ('module', modname) in cls.cache: entry = cls.cache['module', modname] if isinstance(entry, PycodeError): raise entry return entry try: type, source = get_module_source(modname) if type == 'string': obj = cls.for_string(source, modname) else: obj = cls.for_file(source, modname) except PycodeError as err: cls.cache['module', modname] = err raise cls.cache['module', modname] = obj return obj def __init__(self, source, modname, srcname, decoded=False): # name of the module self.modname = modname # name of the source file self.srcname = srcname # file-like object yielding source lines self.source = source # cache the source code as well pos = self.source.tell() if not decoded: self.encoding = detect_encoding(self.source.readline) self.source.seek(pos) self.code = self.source.read().decode(self.encoding) self.source.seek(pos) self.source = TextIOWrapper(self.source, self.encoding) else: self.encoding = None self.code = self.source.read() self.source.seek(pos) # will be filled by tokenize() self.tokens = None # will be filled by parse() self.parsetree = None # will be filled by find_attr_docs() self.attr_docs = None self.tagorder = None # will be filled by find_tags() self.tags = None def tokenize(self): """Generate tokens from the source.""" if self.tokens is not None: return try: self.tokens = 
list(tokenize.generate_tokens(self.source.readline)) except tokenize.TokenError as err: raise PycodeError('tokenizing failed', err) self.source.close() def parse(self): """Parse the generated source tokens.""" if self.parsetree is not None: return self.tokenize() try: self.parsetree = pydriver.parse_tokens(self.tokens) except parse.ParseError as err: raise PycodeError('parsing failed', err) def find_attr_docs(self, scope=''): """Find class and module-level attributes and their documentation.""" if self.attr_docs is not None: return self.attr_docs self.parse() attr_visitor = AttrDocVisitor(number2name, scope, self.encoding) attr_visitor.visit(self.parsetree) self.attr_docs = attr_visitor.collected self.tagorder = attr_visitor.tagorder # now that we found everything we could in the tree, throw it away # (it takes quite a bit of memory for large modules) self.parsetree = None return attr_visitor.collected def find_tags(self): """Find class, function and method definitions and their location.""" if self.tags is not None: return self.tags self.tokenize() result = {} namespace = [] stack = [] indent = 0 defline = False expect_indent = False def tokeniter(ignore=(token.COMMENT, token.NL)): for tokentup in self.tokens: if tokentup[0] not in ignore: yield tokentup tokeniter = tokeniter() for type, tok, spos, epos, line in tokeniter: if expect_indent: if type != token.INDENT: # no suite -- one-line definition assert stack dtype, fullname, startline, _ = stack.pop() endline = epos[0] namespace.pop() result[fullname] = (dtype, startline, endline) expect_indent = False if tok in ('def', 'class'): name = next(tokeniter)[1] namespace.append(name) fullname = '.'.join(namespace) stack.append((tok, fullname, spos[0], indent)) defline = True elif type == token.INDENT: expect_indent = False indent += 1 elif type == token.DEDENT: indent -= 1 # if the stacklevel is the same as it was before the last # def/class block, this dedent closes that block if stack and indent == stack[-1][3]: 
dtype, fullname, startline, _ = stack.pop() endline = spos[0] namespace.pop() result[fullname] = (dtype, startline, endline) elif type == token.NEWLINE: # if this line contained a definition, expect an INDENT # to start the suite; if there is no such INDENT # it's a one-line definition if defline: defline = False expect_indent = True self.tags = result return result
class ModuleAnalyzer(object):
    """Tokenize and parse one module's source code.

    Instances are obtained through :meth:`for_string`, :meth:`for_file` or
    :meth:`for_module`.  Analyzers created by the latter two are memoized
    in the class-level :attr:`cache`.
    """

    # cache for analyzer objects -- caches both by module and file name
    cache = {}

    @classmethod
    def for_string(cls, string, modname, srcname='<string>'):
        """Return an uncached analyzer for source given as bytes or text."""
        if isinstance(string, bytes):
            return cls(BytesIO(string), modname, srcname)
        return cls(StringIO(string), modname, srcname, decoded=True)

    @classmethod
    def for_file(cls, filename, modname):
        """Return the (cached) analyzer for the source file *filename*.

        :raises PycodeError: if the file cannot be opened
        """
        if ('file', filename) in cls.cache:
            return cls.cache['file', filename]
        try:
            fileobj = open(filename, 'rb')
        except Exception as err:
            raise PycodeError('error opening %r' % filename, err)
        try:
            obj = cls(fileobj, modname, filename)
        except Exception:
            # the constructor reads and decodes the file; do not leak the
            # handle when that fails
            fileobj.close()
            raise
        cls.cache['file', filename] = obj
        return obj

    @classmethod
    def for_module(cls, modname):
        """Return the (cached) analyzer for the module named *modname*.

        A :exc:`PycodeError` raised while locating or opening the source
        is cached as well and re-raised on later calls.
        """
        if ('module', modname) in cls.cache:
            entry = cls.cache['module', modname]
            if isinstance(entry, PycodeError):
                raise entry
            return entry

        try:
            srctype, source = get_module_source(modname)
            if srctype == 'string':
                obj = cls.for_string(source, modname)
            else:
                obj = cls.for_file(source, modname)
        except PycodeError as err:
            cls.cache['module', modname] = err
            raise
        cls.cache['module', modname] = obj
        return obj

    def __init__(self, source, modname, srcname, decoded=False):
        """Store the module identity and cache the full source text.

        :param source: seekable file-like object; yields bytes unless
            *decoded* is true
        :param modname: name of the module
        :param srcname: name of the source file
        :param decoded: true when *source* already yields text
        """
        # name of the module
        self.modname = modname
        # name of the source file
        self.srcname = srcname
        # file-like object yielding source lines
        self.source = source

        # cache the source code as well
        pos = self.source.tell()
        if not decoded:
            self.encoding = detect_encoding(self.source.readline)
            self.source.seek(pos)
            self.code = self.source.read().decode(self.encoding)
            self.source.seek(pos)

            self.source = TextIOWrapper(self.source, self.encoding)
        else:
            self.encoding = None
            self.code = self.source.read()
            self.source.seek(pos)

        # will be filled by tokenize()
        self.tokens = None
        # will be filled by parse()
        self.parsetree = None
        # will be filled by find_attr_docs()
        self.attr_docs = None
        self.tagorder = None
        # will be filled by find_tags()
        self.tags = None

    def tokenize(self):
        """Generate tokens from the source.

        The source object is closed once all tokens have been read.

        :raises PycodeError: if the tokenizer reports a ``TokenError``
        """
        if self.tokens is not None:
            return
        try:
            self.tokens = list(tokenize.generate_tokens(self.source.readline))
        except tokenize.TokenError as err:
            raise PycodeError('tokenizing failed', err)
        self.source.close()

    def parse(self):
        """Parse the generated source tokens.

        :raises PycodeError: if tokenizing or parsing fails
        """
        if self.parsetree is not None:
            return
        self.tokenize()
        try:
            self.parsetree = pydriver.parse_tokens(self.tokens)
        except parse.ParseError as err:
            raise PycodeError('parsing failed', err)

    def find_attr_docs(self, scope=''):
        """Find class and module-level attributes and their documentation."""
        if self.attr_docs is not None:
            return self.attr_docs
        self.parse()
        attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
        attr_visitor.visit(self.parsetree)
        self.attr_docs = attr_visitor.collected
        self.tagorder = attr_visitor.tagorder
        # now that we found everything we could in the tree, throw it away
        # (it takes quite a bit of memory for large modules)
        self.parsetree = None
        return attr_visitor.collected

    def find_tags(self):
        """Find class, function and method definitions and their location.

        Returns a dict mapping the dotted name of each definition to a
        ``(kind, startline, endline)`` tuple, where *kind* is ``'def'`` or
        ``'class'``.  Empty and comment-only lines directly preceding the
        token that closes a block are not counted in *endline*.  The
        result is cached in ``self.tags``.
        """
        if self.tags is not None:
            return self.tags
        self.tokenize()
        result = {}
        namespace = []
        stack = []
        indent = 0
        defline = False
        expect_indent = False
        # number of empty/comment-only lines seen since the last logical line
        emptylines = 0

        # NL tokens are kept (unlike comments) because they drive the
        # ``emptylines`` counter
        ignored = (token.COMMENT,)
        tokeniter = (tup for tup in self.tokens if tup[0] not in ignored)
        for toktype, tok, spos, epos, line in tokeniter:
            # An NL token (blank or comment-only line) between a definition
            # header and its suite says nothing about whether an INDENT
            # follows, so it must not be taken for a one-line definition.
            if expect_indent and toktype != token.NL:
                if toktype != token.INDENT:
                    # no suite -- one-line definition
                    assert stack
                    dtype, fullname, startline, _ = stack.pop()
                    endline = epos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline,
                                        endline - emptylines)
                expect_indent = False
            if tok in ('def', 'class'):
                # the token following the keyword is the defined name
                name = next(tokeniter)[1]
                namespace.append(name)
                fullname = '.'.join(namespace)
                stack.append((tok, fullname, spos[0], indent))
                defline = True
            elif toktype == token.INDENT:
                expect_indent = False
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
                # if the stacklevel is the same as it was before the last
                # def/class block, this dedent closes that block
                if stack and indent == stack[-1][3]:
                    dtype, fullname, startline, _ = stack.pop()
                    endline = spos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline,
                                        endline - emptylines)
            elif toktype == token.NEWLINE:
                # if this line contained a definition, expect an INDENT
                # to start the suite; if there is no such INDENT
                # it's a one-line definition
                if defline:
                    defline = False
                    expect_indent = True
                emptylines = 0
            elif toktype == token.NL:
                # count up if line is empty or comment only
                if emptyline_re.match(line):
                    emptylines += 1
                else:
                    emptylines = 0
        self.tags = result
        return result
def check():
    """Classify the link ``uri`` and return ``(status, info, code)``.

    ``uri``, ``self``, ``opener`` and ``kwargs`` are free names resolved
    from the surrounding scope.  The status is one of ``'unchecked'``,
    ``'local'``, ``'working'``, ``'broken'``, ``'redirected'`` or
    ``'ignored'``.  Results of the network check are remembered in
    ``self.good``, ``self.broken`` and ``self.redirected``.
    """
    # check for various conditions without bothering the network
    if not uri or uri.startswith(('#', 'mailto:', 'ftp:')):
        return 'unchecked', '', 0
    elif not uri.startswith(('http:', 'https:')):
        return 'local', '', 0
    elif uri in self.good:
        return 'working', 'old', 0
    elif uri in self.broken:
        return 'broken', self.broken[uri], 0
    elif uri in self.redirected:
        return ('redirected', self.redirected[uri][0],
                self.redirected[uri][1])
    for rex in self.to_ignore:
        if rex.match(uri):
            return 'ignored', '', 0

    # split off anchor (named ``anchor`` so the builtin ``hash`` is not
    # shadowed)
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        parts = urlsplit(req_url)
        # Every piece is brought back to text before joining: mixing the
        # bytes from encode() with str literals raises TypeError on
        # Python 3.
        netloc = parts[1].encode('idna').decode('ascii')
        path = quote(parts[2].encode('utf-8'))
        req_url = parts[0] + '://' + netloc + path
        if parts[3]:
            # NOTE(review): quote() with its default ``safe`` also escapes
            # '=' and '&' in the query -- confirm this is intended.
            req_url += '?' + quote(parts[3].encode('utf-8'))

    # need to actually check the URI
    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # release the response even when the anchor check raises
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            self.good.add(uri)
            return 'working', ' - unauthorized', 0
        else:
            self.broken[uri] = str(err)
            return 'broken', str(err), 0
    except Exception as err:
        self.broken[uri] = str(err)
        return 'broken', str(err), 0

    # the final URL differing from the requested one means a redirect
    if f.url.rstrip('/') == req_url.rstrip('/'):
        self.good.add(uri)
        return 'working', '', 0
    else:
        new_url = f.url
        if anchor:
            new_url += '#' + anchor
        code = getattr(req, 'redirect_code', 0)
        self.redirected[uri] = (new_url, code)
        return 'redirected', new_url, code
def check_uri():
    """Request ``uri`` over the network and return ``(status, info, code)``.

    ``uri``, ``self``, ``opener`` and ``kwargs`` are free names resolved
    from the surrounding scope.  The status is ``'working'``, ``'broken'``
    or ``'redirected'``.  Nothing is cached here.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors and \
                not anchor.startswith('!'):
            # Read the whole document and see if #anchor exists
            # (Anchors starting with ! are ignored since they are
            # commonly used for dynamic pages)
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                else:
                    encoding = get_content_charset(f) or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # release the response even when charset detection or the
                # anchor check raises
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code not in (403, 405):
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 403 or 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    # the final URL differing from the requested one means a redirect
    if f.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0
    else:
        new_url = f.url
        if anchor:
            new_url += '#' + anchor
        code = getattr(req, 'redirect_code', 0)
        return 'redirected', new_url, code