Example #1
    def isDirectory(self, path):
        """Return whether the path is a directory.

        Assumes any path ending in a slash is a directory, and any that
        redirects to a location ending in a slash is also a directory.
        """
        if path.endswith("/"):
            return True

        # If the URI scheme is FTP, then the URI comes from a Squid
        # FTP listing page, which includes the trailing slash on all
        # URIs that need it.
        if self.scheme == 'ftp':
            return False

        self.log.debug("Checking if %s is a directory" % path)
        try:
            response = self.request("HEAD", path)
        except (TimeoutError, requests.RequestException) as exc:
            raise HTTPWalkerError(str(exc))

        if not response.is_redirect or "location" not in response.headers:
            return False
        url = response.headers["location"]
        scheme, netloc, redirect_path, _, _ = urlsplit(url, self.scheme,
                                                       self.FRAGMENTS)

        if scheme and scheme != self.scheme:
            return False
        if netloc and netloc != self.full_netloc:
            return False
        # A directory redirects to itself with a trailing slash added.
        return redirect_path == as_dir(path)
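
For reference, the same heuristic can be sketched as a standalone function built only on requests; looks_like_directory and the URL handling below are illustrative, not part of the walker class above.

    import requests
    from urllib.parse import urlsplit

    def looks_like_directory(url):
        # A path that already ends in a slash is taken to be a directory.
        if url.endswith("/"):
            return True
        # Many HTTP servers answer a HEAD for /name with a redirect to
        # /name/ when /name is a directory; check for that pattern.
        response = requests.head(url, allow_redirects=False)
        location = response.headers.get("location", "")
        return (response.is_redirect and
                urlsplit(location).path == urlsplit(url).path + "/")

Unlike the method above, this sketch does not compare the redirect's scheme and netloc against the original, which the walker checks explicitly.
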
Example #2
    def __init__(self, base, log_parent=None):
        """Set up the walker by splitting the base URL into its parts."""
        self.log = log.get_logger(type(self).__name__, log_parent)
        self.base = base

        (scheme, netloc, path, query, fragment) = urlsplit(
            base, self.URL_SCHEMES[0], self.FRAGMENTS)
        if scheme not in self.URL_SCHEMES:
            raise WalkerError("Can't handle %s scheme" % scheme)
        self.scheme = scheme
        self.full_netloc = netloc

        try:
            # Split optional "user[:password]@" credentials off the host.
            (user_passwd, host) = netloc.split("@", 1)
            self.host = unquote_plus(host)

            try:
                (user, passwd) = user_passwd.split(":", 1)
                self.user = unquote_plus(user)
                self.passwd = unquote_plus(passwd)
            except ValueError:
                self.user = unquote_plus(user_passwd)
                self.passwd = None
        except ValueError:
            self.host = unquote_plus(netloc)
            self.user = None
            self.passwd = None

        self.query = query
        self.fragment = fragment

        self.path = as_dir(path)
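
A quick illustration of how the nested splits recover percent-encoded credentials from a netloc, using the Python 3 spelling of the import; the values are made up:

    from urllib.parse import unquote_plus

    netloc = "anonymous:secret%40word@ftp.example.com"
    user_passwd, host = netloc.split("@", 1)   # the password's @ is encoded
    user, passwd = user_passwd.split(":", 1)
    print(unquote_plus(user))    # anonymous
    print(unquote_plus(passwd))  # secret@word
    print(unquote_plus(host))    # ftp.example.com
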
Example #3
    def list(self, dirname):
        """Download the HTML index at subdir and scrape for URLs.

        Returns a list of directory names (links ending with /, or
        that result in redirects to themselves ending in /) and
        filenames (everything else) that reside underneath the path.
        """
        self.log.info("Listing %s" % dirname)
        try:
            response = self.request("GET", dirname)
            try:
                soup = BeautifulSoup(response.read())
            finally:
                response.close()
        except (IOError, socket.error) as exc:
            raise HTTPWalkerError(str(exc))

        base = URI(self.base).resolve(dirname)

        # Collect set of URLs that are below the base URL
        urls = set()
        for anchor in soup("a"):
            href = anchor.get("href")
            if href is None:
                continue
            try:
                url = base.resolve(href)
            except InvalidURIError:
                continue
            # Only add the URL if it is strictly inside the base URL.
            if base.contains(url) and not url.contains(base):
                urls.add(url)

        dirnames = set()
        filenames = set()
        for url in urls:
            if url.path.endswith(';type=a') or url.path.endswith(';type=i'):
                # These links come from Squid's FTP dir listing to
                # force either ASCII or binary download and can be
                # ignored.
                continue

            filename = subdir(base.path, url.path)
            if self.isDirectory(url.path):
                dirnames.add(as_dir(filename))
            else:
                filenames.add(filename)

        return (sorted(dirnames), sorted(filenames))
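
Hypothetical usage, assuming a configured walker instance and a reachable index page; the path and names are examples only:

    dirnames, filenames = walker.list("/pub/releases/")
    for name in dirnames:
        print("dir: ", name)    # e.g. "1.0/", always slash-terminated
    for name in filenames:
        print("file:", name)    # e.g. "product-1.0.tar.gz"
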
Example #4
    def isDirectory(self, path):
        """Return whether the path is a directory.

        Assumes any path ending in a slash is a directory, and any that
        redirects to a location ending in a slash is also a directory.
        """
        if path.endswith("/"):
            return True

        # If the URI scheme is FTP, then the URI comes from a Squid
        # FTP listing page, which includes the trailing slash on all
        # URIs that need it.
        if self.scheme == 'ftp':
            return False

        self.log.debug("Checking if %s is a directory" % path)
        try:
            self.request("HEAD", path)
            return False
        except urllib2.HTTPError as exc:
            if exc.code != 301:
                return False
            # Capture the redirect headers here rather than relying on
            # the exception variable outside the handler.
            redirect_headers = exc.hdrs
        except (IOError, socket.error) as exc:
            # Raise HTTPWalkerError for other IO or socket errors.
            raise HTTPWalkerError(str(exc))

        # We have a 301 redirect from here on.
        url = redirect_headers.getheader("location")
        (scheme, netloc, redirect_path, _, _) = urlsplit(
            url, self.scheme, self.FRAGMENTS)

        if scheme and scheme != self.scheme:
            return False
        if netloc and netloc != self.full_netloc:
            return False
        # A directory redirects to itself with a trailing slash added.
        return redirect_path == as_dir(path)
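
The wire-level exchange this variant depends on can be sketched with http.client directly; the host and path are made-up examples:

    import http.client

    conn = http.client.HTTPConnection("example.com")
    conn.request("HEAD", "/dists")             # note: no trailing slash
    response = conn.getresponse()
    print(response.status)                     # 301 when /dists is a directory
    print(response.getheader("location"))      # e.g. http://example.com/dists/
    conn.close()
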
Example #5
    def walk(self):
        """Walk through the URL.

        Yields (dirpath, dirnames, filenames) for each path under the base;
        dirnames can be modified as with os.walk.
        """
        try:
            self.open()
        except (IOError, socket.error) as e:
            self.log.info("Could not connect to %s" % self.base)
            self.log.info("Failure: %s" % e)
            return

        subdirs = [self.path]
        while subdirs:
            # Breadth-first traversal: pop directories from the front of
            # the queue; newly discovered ones are appended at the back.
            sub_dir = subdirs.pop(0)

            try:
                (dirnames, filenames) = self.list(sub_dir)
            except WalkerError:
                self.log.info("Could not retrieve directory "
                              "listing for %s", sub_dir)
                continue
            except UnicodeEncodeError:
                # This page is unparsable.
                # XXX sinzui 2009-06-22 bug=70524:
                # This problem should be reported to the project drivers
                # so that they can attempt to get this fixed.
                self.log.info(
                    "Unicode error parsing %s page '%s'" %
                    (self.base, sub_dir))
                continue
            yield (sub_dir, dirnames, filenames)

            for dirname in dirnames:
                subdirs.append(urljoin(sub_dir, as_dir(dirname)))

        self.close()
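
A hypothetical driver for walk(), assuming the surrounding class is named HTTPWalker (as HTTPWalkerError suggests); pruning dirnames in place skips those subtrees, just as with os.walk:

    walker = HTTPWalker("http://example.com/pub/")
    for dirpath, dirnames, filenames in walker.walk():
        # Dropping entries from dirnames prevents walk() from
        # descending into them.
        dirnames[:] = [d for d in dirnames if not d.startswith("old")]
        for filename in filenames:
            print(dirpath + filename)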