Esempio n. 1
0
    def __init__(self, source, modname, srcname, decoded=False):
        """Store the module identity and cache the source text.

        *source* is a seekable file-like object.  When *decoded* is false it
        is treated as a byte stream: the encoding is obtained from
        ``detect_encoding``, the bytes are decoded into ``self.code`` and
        the stream is wrapped in a ``TextIOWrapper``.  When *decoded* is
        true the stream is read as-is and ``self.encoding`` is ``None``.
        In both cases the stream is rewound to where it stood on entry.
        """
        self.modname = modname    # name of the module
        self.srcname = srcname    # name of the source file
        self.source = source      # file-like object yielding source lines

        # remember the entry position so every read below can be undone
        start = source.tell()
        if decoded:
            # already text: nothing to detect, nothing to decode
            self.encoding = None
            self.code = source.read()
            source.seek(start)
        else:
            # encoding detection consumes lines, so rewind before the
            # full read and once more before handing the stream on
            self.encoding = detect_encoding(source.readline)
            source.seek(start)
            raw = source.read()
            self.code = raw.decode(self.encoding)
            source.seek(start)
            self.source = TextIOWrapper(source, self.encoding)

        # lazily computed results
        self.tokens = None        # filled by tokenize()
        self.parsetree = None     # filled by parse()
        self.attr_docs = None     # filled by find_attr_docs()
        self.tagorder = None      # filled by find_attr_docs()
        self.tags = None          # filled by find_tags()
Esempio n. 2
0
    def __init__(self, source, modname, srcname, decoded=False):
        """Record what is being analyzed and keep a copy of its source.

        *source* must support ``tell``/``seek``/``read``.  With
        ``decoded=False`` the stream yields bytes: ``detect_encoding`` is
        asked for the encoding, the decoded text is kept in ``self.code``
        and ``self.source`` becomes a ``TextIOWrapper`` around the stream.
        With ``decoded=True`` the text is read unchanged and no encoding is
        recorded.  The stream position found on entry is restored.
        """
        # name of the module / name of the source file
        self.modname, self.srcname = modname, srcname
        # file-like object yielding source lines
        self.source = source

        # cache the source code as well, leaving the stream where it was
        origin = source.tell()
        if decoded:
            self.encoding = None
            text = source.read()
            source.seek(origin)
        else:
            self.encoding = detect_encoding(source.readline)
            source.seek(origin)
            text = source.read().decode(self.encoding)
            source.seek(origin)
            self.source = TextIOWrapper(source, self.encoding)
        self.code = text

        # will be filled by tokenize()
        self.tokens = None
        # will be filled by parse()
        self.parsetree = None
        # both will be filled by find_attr_docs()
        self.attr_docs = None
        self.tagorder = None
        # will be filled by find_tags()
        self.tags = None
Esempio n. 3
0
        def check():
            """Classify ``uri`` and return a ``(status, info, code)`` tuple.

            Cheap checks come first: empty, fragment-only, ``mailto:`` and
            ``ftp:`` URIs are ``'unchecked'``, anything that is not
            http(s) is ``'local'``, and URIs already recorded in
            ``self.good`` / ``self.broken`` / ``self.redirected`` or matched
            by ``self.to_ignore`` are answered from those records.  Only
            then is the network contacted; any failure is recorded in
            ``self.broken`` and reported as ``'broken'``.

            NOTE(review): this excerpt ends after the error handlers, so a
            request that succeeds falls off the end and returns ``None``
            here -- confirm against the complete function.
            """
            # check for various conditions without bothering the network
            if len(uri) == 0 or uri[0] == '#' or \
               uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
                return 'unchecked', '', 0
            elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
                return 'local', '', 0
            elif uri in self.good:
                return 'working', '', 0
            elif uri in self.broken:
                return 'broken', self.broken[uri], 0
            elif uri in self.redirected:
                return 'redirected', self.redirected[uri][0], self.redirected[
                    uri][1]
            for rex in self.to_ignore:
                if rex.match(uri):
                    return 'ignored', '', 0

            # split off the fragment; it is never sent to the server
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
            else:
                req_url = uri
                anchor = None

            # need to actually check the URI
            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    req = Request(req_url)
                    f = opener.open(req, **kwargs)
                    try:
                        encoding = 'utf-8'
                        if hasattr(f.headers, 'get_content_charset'):
                            encoding = (f.headers.get_content_charset()
                                        or encoding)
                        found = check_anchor(TextIOWrapper(f, encoding),
                                             unquote(anchor))
                    finally:
                        # release the connection even if the scan fails
                        f.close()

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request, which should be easier on
                        # the server and the network
                        req = HeadRequest(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
                    except HTTPError as err:
                        if err.code != 405:
                            raise
                        # retry with GET if that fails, some servers
                        # don't like HEAD requests and reply with 405
                        req = Request(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()

            except Exception as err:
                self.broken[uri] = str(err)
                return 'broken', str(err), 0
Esempio n. 4
0
class ModuleAnalyzer(object):
    """Analyze the source of one module: tokens, parse tree, tags.

    Instances are normally obtained through the ``for_string``,
    ``for_file`` or ``for_module`` constructors; the latter two memoize
    their results in the class-level ``cache``.
    """

    # cache for analyzer objects -- caches both by module and file name
    cache = {}

    @classmethod
    def for_string(cls, string, modname, srcname='<string>'):
        """Return an analyzer for source passed as ``bytes`` or as text.

        ``bytes`` input goes through encoding detection in ``__init__``;
        anything else is treated as already decoded.
        """
        if isinstance(string, bytes):
            return cls(BytesIO(string), modname, srcname)
        return cls(StringIO(string), modname, srcname, decoded=True)

    @classmethod
    def for_file(cls, filename, modname):
        """Return the (cached) analyzer for the source file *filename*.

        Raises ``PycodeError`` if the file cannot be opened.  Errors raised
        by the constructor propagate unchanged, but the file is closed
        first so the handle is not leaked.
        """
        if ('file', filename) in cls.cache:
            return cls.cache['file', filename]
        try:
            fileobj = open(filename, 'rb')
        except Exception as err:
            raise PycodeError('error opening %r' % filename, err)
        try:
            obj = cls(fileobj, modname, filename)
        except BaseException:
            # on success the analyzer owns the file (tokenize() closes it);
            # on failure nobody does, so close it here
            fileobj.close()
            raise
        cls.cache['file', filename] = obj
        return obj

    @classmethod
    def for_module(cls, modname):
        """Return the (cached) analyzer for the module named *modname*.

        A ``PycodeError`` raised while locating or loading the source is
        cached as well and re-raised on every later request.
        """
        if ('module', modname) in cls.cache:
            entry = cls.cache['module', modname]
            if isinstance(entry, PycodeError):
                raise entry
            return entry

        try:
            kind, source = get_module_source(modname)
            if kind == 'string':
                obj = cls.for_string(source, modname)
            else:
                obj = cls.for_file(source, modname)
        except PycodeError as err:
            cls.cache['module', modname] = err
            raise
        cls.cache['module', modname] = obj
        return obj

    def __init__(self, source, modname, srcname, decoded=False):
        """Store the module identity and cache the source text.

        *source* is a seekable file-like object.  Unless *decoded* is true
        it yields bytes: the encoding is taken from ``detect_encoding``,
        the decoded text is kept in ``self.code`` and the stream is wrapped
        in a ``TextIOWrapper``.  The stream is rewound to its entry
        position afterwards.
        """
        # name of the module
        self.modname = modname
        # name of the source file
        self.srcname = srcname
        # file-like object yielding source lines
        self.source = source

        # cache the source code as well
        pos = self.source.tell()
        if not decoded:
            self.encoding = detect_encoding(self.source.readline)
            self.source.seek(pos)
            self.code = self.source.read().decode(self.encoding)
            self.source.seek(pos)
            self.source = TextIOWrapper(self.source, self.encoding)
        else:
            self.encoding = None
            self.code = self.source.read()
            self.source.seek(pos)

        # will be filled by tokenize()
        self.tokens = None
        # will be filled by parse()
        self.parsetree = None
        # will be filled by find_attr_docs()
        self.attr_docs = None
        self.tagorder = None
        # will be filled by find_tags()
        self.tags = None

    def tokenize(self):
        """Generate tokens from the source.

        Does nothing if tokens are already present; otherwise fills
        ``self.tokens`` and closes the source stream.  Raises
        ``PycodeError`` when the tokenizer reports a ``TokenError``.
        """
        if self.tokens is not None:
            return
        try:
            self.tokens = list(tokenize.generate_tokens(self.source.readline))
        except tokenize.TokenError as err:
            raise PycodeError('tokenizing failed', err)
        self.source.close()

    def parse(self):
        """Parse the generated source tokens.

        Tokenizes first if necessary.  Raises ``PycodeError`` when the
        parser reports a ``ParseError``.
        """
        if self.parsetree is not None:
            return
        self.tokenize()
        try:
            self.parsetree = pydriver.parse_tokens(self.tokens)
        except parse.ParseError as err:
            raise PycodeError('parsing failed', err)

    def find_attr_docs(self, scope=''):
        """Find class and module-level attributes and their documentation.

        The result is cached in ``self.attr_docs``; ``self.tagorder`` is
        set as a side effect.
        """
        if self.attr_docs is not None:
            return self.attr_docs
        self.parse()
        attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
        attr_visitor.visit(self.parsetree)
        self.attr_docs = attr_visitor.collected
        self.tagorder = attr_visitor.tagorder
        # now that we found everything we could in the tree, throw it away
        # (it takes quite a bit of memory for large modules)
        self.parsetree = None
        return attr_visitor.collected

    def find_tags(self):
        """Find class, function and method definitions and their location.

        Returns (and caches in ``self.tags``) a dict mapping the dotted
        name of each definition to ``(keyword, startline, endline)``, where
        *keyword* is ``'def'`` or ``'class'``.
        """
        if self.tags is not None:
            return self.tags
        self.tokenize()
        result = {}
        namespace = []
        stack = []
        indent = 0
        defline = False
        expect_indent = False

        # comments and non-logical newlines carry no structure
        ignored = (token.COMMENT, token.NL)
        significant = (tup for tup in self.tokens if tup[0] not in ignored)

        for toktype, tok, spos, epos, line in significant:
            if expect_indent:
                if toktype != token.INDENT:
                    # no suite -- one-line definition
                    assert stack
                    dtype, fullname, startline, _ = stack.pop()
                    endline = epos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline, endline)
                expect_indent = False
            if tok in ('def', 'class'):
                # the very next token is the name being defined
                name = next(significant)[1]
                namespace.append(name)
                fullname = '.'.join(namespace)
                stack.append((tok, fullname, spos[0], indent))
                defline = True
            elif toktype == token.INDENT:
                expect_indent = False
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
                # if the stacklevel is the same as it was before the last
                # def/class block, this dedent closes that block
                if stack and indent == stack[-1][3]:
                    dtype, fullname, startline, _ = stack.pop()
                    endline = spos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline, endline)
            elif toktype == token.NEWLINE:
                # if this line contained a definition, expect an INDENT
                # to start the suite; if there is no such INDENT
                # it's a one-line definition
                if defline:
                    defline = False
                    expect_indent = True
        self.tags = result
        return result
Esempio n. 5
0
class ModuleAnalyzer(object):
    """Analyze the source of one module: tokens, parse tree, tags.

    Instances are normally obtained through the ``for_string``,
    ``for_file`` or ``for_module`` constructors; the latter two memoize
    their results in the class-level ``cache``.
    """

    # cache for analyzer objects -- caches both by module and file name
    cache = {}

    @classmethod
    def for_string(cls, string, modname, srcname='<string>'):
        """Return an analyzer for source passed as ``bytes`` or as text.

        ``bytes`` input goes through encoding detection in ``__init__``;
        anything else is treated as already decoded.
        """
        if isinstance(string, bytes):
            return cls(BytesIO(string), modname, srcname)
        return cls(StringIO(string), modname, srcname, decoded=True)

    @classmethod
    def for_file(cls, filename, modname):
        """Return the (cached) analyzer for the source file *filename*.

        Raises ``PycodeError`` if the file cannot be opened.  Errors raised
        by the constructor propagate unchanged, but the file is closed
        first so the handle is not leaked.
        """
        if ('file', filename) in cls.cache:
            return cls.cache['file', filename]
        try:
            fileobj = open(filename, 'rb')
        except Exception as err:
            raise PycodeError('error opening %r' % filename, err)
        try:
            obj = cls(fileobj, modname, filename)
        except BaseException:
            # on success the analyzer owns the file (tokenize() closes it);
            # on failure nobody does, so close it here
            fileobj.close()
            raise
        cls.cache['file', filename] = obj
        return obj

    @classmethod
    def for_module(cls, modname):
        """Return the (cached) analyzer for the module named *modname*.

        A ``PycodeError`` raised while locating or loading the source is
        cached as well and re-raised on every later request.
        """
        if ('module', modname) in cls.cache:
            entry = cls.cache['module', modname]
            if isinstance(entry, PycodeError):
                raise entry
            return entry

        try:
            kind, source = get_module_source(modname)
            if kind == 'string':
                obj = cls.for_string(source, modname)
            else:
                obj = cls.for_file(source, modname)
        except PycodeError as err:
            cls.cache['module', modname] = err
            raise
        cls.cache['module', modname] = obj
        return obj

    def __init__(self, source, modname, srcname, decoded=False):
        """Store the module identity and cache the source text.

        *source* is a seekable file-like object.  Unless *decoded* is true
        it yields bytes: the encoding is taken from ``detect_encoding``,
        the decoded text is kept in ``self.code`` and the stream is wrapped
        in a ``TextIOWrapper``.  The stream is rewound to its entry
        position afterwards.
        """
        # name of the module
        self.modname = modname
        # name of the source file
        self.srcname = srcname
        # file-like object yielding source lines
        self.source = source

        # cache the source code as well
        pos = self.source.tell()
        if not decoded:
            self.encoding = detect_encoding(self.source.readline)
            self.source.seek(pos)
            self.code = self.source.read().decode(self.encoding)
            self.source.seek(pos)
            self.source = TextIOWrapper(self.source, self.encoding)
        else:
            self.encoding = None
            self.code = self.source.read()
            self.source.seek(pos)

        # will be filled by tokenize()
        self.tokens = None
        # will be filled by parse()
        self.parsetree = None
        # will be filled by find_attr_docs()
        self.attr_docs = None
        self.tagorder = None
        # will be filled by find_tags()
        self.tags = None

    def tokenize(self):
        """Generate tokens from the source.

        Does nothing if tokens are already present; otherwise fills
        ``self.tokens`` and closes the source stream.  Raises
        ``PycodeError`` when the tokenizer reports a ``TokenError``.
        """
        if self.tokens is not None:
            return
        try:
            self.tokens = list(tokenize.generate_tokens(self.source.readline))
        except tokenize.TokenError as err:
            raise PycodeError('tokenizing failed', err)
        self.source.close()

    def parse(self):
        """Parse the generated source tokens.

        Tokenizes first if necessary.  Raises ``PycodeError`` when the
        parser reports a ``ParseError``.
        """
        if self.parsetree is not None:
            return
        self.tokenize()
        try:
            self.parsetree = pydriver.parse_tokens(self.tokens)
        except parse.ParseError as err:
            raise PycodeError('parsing failed', err)

    def find_attr_docs(self, scope=''):
        """Find class and module-level attributes and their documentation.

        The result is cached in ``self.attr_docs``; ``self.tagorder`` is
        set as a side effect.
        """
        if self.attr_docs is not None:
            return self.attr_docs
        self.parse()
        attr_visitor = AttrDocVisitor(number2name, scope, self.encoding)
        attr_visitor.visit(self.parsetree)
        self.attr_docs = attr_visitor.collected
        self.tagorder = attr_visitor.tagorder
        # now that we found everything we could in the tree, throw it away
        # (it takes quite a bit of memory for large modules)
        self.parsetree = None
        return attr_visitor.collected

    def find_tags(self):
        """Find class, function and method definitions and their location.

        Returns (and caches in ``self.tags``) a dict mapping the dotted
        name of each definition to ``(keyword, startline, endline)``, where
        *keyword* is ``'def'`` or ``'class'``.  Lines matched by
        ``emptyline_re`` directly before the closing token are subtracted
        from *endline*.
        """
        if self.tags is not None:
            return self.tags
        self.tokenize()
        result = {}
        namespace = []
        stack = []
        indent = 0
        defline = False
        expect_indent = False
        emptylines = 0

        # only comments are dropped; NL tokens are needed to count the
        # empty lines trailing a definition
        ignored = (token.COMMENT,)
        significant = (tup for tup in self.tokens if tup[0] not in ignored)

        for toktype, tok, spos, epos, line in significant:
            # An NL token (blank or comment-only line) between a definition
            # header and its suite says nothing about whether an INDENT
            # follows, so keep waiting instead of closing the definition
            # as a one-liner.
            if expect_indent and toktype != token.NL:
                if toktype != token.INDENT:
                    # no suite -- one-line definition
                    assert stack
                    dtype, fullname, startline, _ = stack.pop()
                    endline = epos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline, endline - emptylines)
                expect_indent = False
            if tok in ('def', 'class'):
                # the very next token is the name being defined
                name = next(significant)[1]
                namespace.append(name)
                fullname = '.'.join(namespace)
                stack.append((tok, fullname, spos[0], indent))
                defline = True
            elif toktype == token.INDENT:
                expect_indent = False
                indent += 1
            elif toktype == token.DEDENT:
                indent -= 1
                # if the stacklevel is the same as it was before the last
                # def/class block, this dedent closes that block
                if stack and indent == stack[-1][3]:
                    dtype, fullname, startline, _ = stack.pop()
                    endline = spos[0]
                    namespace.pop()
                    result[fullname] = (dtype, startline, endline - emptylines)
            elif toktype == token.NEWLINE:
                # if this line contained a definition, expect an INDENT
                # to start the suite; if there is no such INDENT
                # it's a one-line definition
                if defline:
                    defline = False
                    expect_indent = True
                emptylines = 0
            elif toktype == token.NL:
                # count up if line is empty or comment only
                if emptyline_re.match(line):
                    emptylines += 1
                else:
                    emptylines = 0
        self.tags = result
        return result
Esempio n. 6
0
        def check():
            """Classify ``uri`` and return a ``(status, info, code)`` tuple.

            Cheap checks come first: empty, fragment-only, ``mailto:`` and
            ``ftp:`` URIs are ``'unchecked'``, anything that is not
            http(s) is ``'local'``, and URIs already recorded in
            ``self.good`` / ``self.broken`` / ``self.redirected`` or matched
            by ``self.to_ignore`` are answered from those records.
            Otherwise the URI is requested; the outcome is stored in the
            matching record and reported as ``'working'``, ``'broken'`` or
            ``'redirected'`` (with the new URL and the redirect code).
            """
            # check for various conditions without bothering the network
            if len(uri) == 0 or uri[0] == '#' or \
               uri[0:7] == 'mailto:' or uri[0:4] == 'ftp:':
                return 'unchecked', '', 0
            elif not (uri[0:5] == 'http:' or uri[0:6] == 'https:'):
                return 'local', '', 0
            elif uri in self.good:
                return 'working', 'old', 0
            elif uri in self.broken:
                return 'broken', self.broken[uri], 0
            elif uri in self.redirected:
                return 'redirected', self.redirected[uri][0], self.redirected[
                    uri][1]
            for rex in self.to_ignore:
                if rex.match(uri):
                    return 'ignored', '', 0

            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                split = urlsplit(req_url)
                # Every part is reduced to pure ASCII but kept as text, so
                # the pieces can be joined without mixing bytes and str.
                req_url = (
                    split[0] + '://' +  # scheme
                    split[1].encode('idna').decode('ascii') +  # netloc
                    quote(split[2].encode('utf-8')))  # path
                if split[3]:  # query
                    # keep the key/value separators intact
                    req_url += '?' + quote(split[3].encode('utf-8'),
                                           safe='/=&')

            # need to actually check the URI
            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    req = Request(req_url)
                    f = opener.open(req, **kwargs)
                    try:
                        encoding = 'utf-8'
                        if hasattr(f.headers, 'get_content_charset'):
                            encoding = (f.headers.get_content_charset()
                                        or encoding)
                        found = check_anchor(TextIOWrapper(f, encoding),
                                             unquote(anchor))
                    finally:
                        # release the connection even if the scan fails
                        f.close()

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request, which should be easier on
                        # the server and the network
                        req = HeadRequest(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
                    except HTTPError as err:
                        if err.code != 405:
                            raise
                        # retry with GET if that fails, some servers
                        # don't like HEAD requests and reply with 405
                        req = Request(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
            except HTTPError as err:
                if err.code == 401:
                    # We'll take "Unauthorized" as working.
                    self.good.add(uri)
                    return 'working', ' - unauthorized', 0
                else:
                    self.broken[uri] = str(err)
                    return 'broken', str(err), 0
            except Exception as err:
                self.broken[uri] = str(err)
                return 'broken', str(err), 0
            # the final URL differing from the requested one (ignoring a
            # trailing slash) means the request was redirected
            if f.url.rstrip('/') == req_url.rstrip('/'):
                self.good.add(uri)
                return 'working', '', 0
            else:
                new_url = f.url
                if anchor:
                    new_url += '#' + anchor
                code = getattr(req, 'redirect_code', 0)
                self.redirected[uri] = (new_url, code)
                return 'redirected', new_url, code
Esempio n. 7
0
        def check_uri():
            """Request ``uri`` and return a ``(status, info, code)`` tuple.

            The status is ``'working'``, ``'broken'`` (with the error text)
            or ``'redirected'`` (with the new URL and the redirect code).
            An HTTP 401 answer counts as working.  When anchor checking is
            enabled and the URI has a fragment not starting with ``!``, the
            document is fetched and searched for that anchor.
            """
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            try:
                if anchor and self.app.config.linkcheck_anchors and \
                   not anchor.startswith('!'):
                    # Read the whole document and see if #anchor exists
                    # (Anchors starting with ! are ignored since they are
                    # commonly used for dynamic pages)
                    req = Request(req_url)
                    f = opener.open(req, **kwargs)
                    try:
                        encoding = 'utf-8'
                        if hasattr(f.headers, 'get_content_charset'):
                            encoding = (f.headers.get_content_charset()
                                        or encoding)
                        else:
                            encoding = get_content_charset(f) or encoding
                        found = check_anchor(TextIOWrapper(f, encoding),
                                             unquote(anchor))
                    finally:
                        # release the connection even if charset detection
                        # or the anchor scan fails
                        f.close()

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request, which should be easier on
                        # the server and the network
                        req = HeadRequest(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
                    except HTTPError as err:
                        if err.code not in (403, 405):
                            raise
                        # retry with GET if that fails, some servers
                        # don't like HEAD requests and reply with 403 or 405
                        req = Request(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
            except HTTPError as err:
                if err.code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                return 'broken', str(err), 0
            # the final URL differing from the requested one (ignoring a
            # trailing slash) means the request was redirected
            if f.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = f.url
                if anchor:
                    new_url += '#' + anchor
                code = getattr(req, 'redirect_code', 0)
                return 'redirected', new_url, code