Example #1
    def test_parse_robots(self):
        try:
            import six.moves.urllib_robotparser as robotparser
        except ImportError:
            raise unittest.SkipTest("six is not installed")

        # Fetch robots.txt from the local test server and check that crawling
        # the site root is disallowed for the wildcard user agent ("*").
        rp = robotparser.RobotFileParser()
        rp.set_url("http://127.0.0.1:%s/robots.txt" % self.port)
        rp.read()
        assert not rp.can_fetch("*", "http://127.0.0.1:%s/" % self.port)
Example #2
    def test_parse_robots(self):
        try:
            import six.moves.urllib_robotparser as robotparser
        except ImportError:
            raise unittest.SkipTest("six is not installed")

        # Route urllib requests for 127.0.0.1:80 to the in-process test app
        # instead of a real socket, then check the robots.txt it serves.
        rp = robotparser.RobotFileParser()
        with UrllibInterceptor(self.app, host='127.0.0.1', port=80) as url:
            rp.set_url("{}/robots.txt".format(url))
            rp.read()
            assert not rp.can_fetch("*", url)
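
Both tests exercise the same API through the six.moves.urllib_robotparser compatibility alias, which resolves to urllib.robotparser on Python 3. As a point of reference, here is a minimal sketch of the equivalent check against the standard library directly (the port number below is a placeholder, not taken from the tests above):

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://127.0.0.1:8000/robots.txt")  # placeholder test server URL
rp.read()                                        # fetch and parse robots.txt
print(rp.can_fetch("*", "http://127.0.0.1:8000/"))  # False if robots.txt disallows "*"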
Example #3
    def _assertAllowed(self, url):
        """Raise HostNotAllowed or RobotExclusionError if *url* may not be fetched."""
        parsed = urlparse.urlparse(url)
        if self.restricted:
            # We are in restricted mode, check host part only
            host = parsed.netloc.partition(':')[0]
            if host in _allowed:
                return
            # Also accept any subdomain of an allowed second-level domain
            for dom in _allowed_2nd_level:
                if host.endswith('.%s' % dom):
                    return

            raise HostNotAllowed(url)
        else:
            # Unrestricted mode: retrieve robots.txt and check against it
            robotsurl = urlparse.urlunsplit(
                (parsed.scheme, parsed.netloc, '/robots.txt', '', ''))
            rp = urllib_robotparser.RobotFileParser()
            rp.set_url(robotsurl)
            rp.read()
            if not rp.can_fetch("*", url):
                msg = "request disallowed by robots.txt"
                raise RobotExclusionError(url, 403, msg, [], None)
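
The unrestricted branch above fetches robots.txt over the network on every check. When the rules are already available (for example in tests), RobotFileParser can be fed the lines directly with parse() instead of read(). A minimal sketch, assuming the Python 3 standard library and made-up rules:

import urllib.robotparser

# Hypothetical rules, supplied in memory instead of fetched with read().
rules = [
    "User-agent: *",
    "Disallow: /private/",
]

rp = urllib.robotparser.RobotFileParser()
rp.parse(rules)
print(rp.can_fetch("*", "http://example.com/private/page"))  # False: path is disallowed
print(rp.can_fetch("*", "http://example.com/public/page"))   # True: no matching rule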