Code Example #1
File: fetch.py Project: ElijahLynn/redbot
            def response_done(trailers):
                if not exchange.status.startswith("2"):
                    robots_txt = ""
                else:
                    robots_txt = exchange.res_body

                self.robot_files[origin] = robots_txt
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, 60*30)  # fresh for 30 minutes

                for _cb in self.robot_lookups[origin]:
                    _cb(robots_txt)
                del self.robot_lookups[origin]
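Across these examples, CacheFile is used through a read() that returns the
cached body (or None on a miss) and a write(content, lifetime) that stores
the body with a freshness lifetime in seconds. A minimal stand-in consistent
with that usage might look like the sketch below; the expiry-line file
layout is an assumption, not redbot's actual implementation.

import time

class CacheFile:
    """Hypothetical sketch of the CacheFile interface used in these examples."""

    def __init__(self, my_path):
        self.path = my_path

    def read(self):
        # Return the cached bytes, or None when missing, corrupt, or stale.
        try:
            with open(self.path, "rb") as fd:
                expires = float(fd.readline())
                if expires < time.time():
                    return None
                return fd.read()
        except (OSError, ValueError):
            return None

    def write(self, content, lifetime):
        # First line holds the expiry timestamp, the rest is the body.
        with open(self.path, "wb") as fd:
            fd.write(("%f\n" % (time.time() + lifetime)).encode("ascii"))
            fd.write(content)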
Code Example #2
            def response_done(trailers: RawHeaderListType) -> None:
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                if self.robot_cache_dir:
                    robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                # Drain every URL that was waiting on this origin's robots.txt.
                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                del self.robot_lookups[origin]
Code Example #3
            def response_done(trailers: RawHeaderListType) -> None:
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                try:
                    del self.robot_lookups[origin]
                except KeyError:
                    # Already removed if response_done runs twice.
                    pass
Code Example #4
File: fetch.py Project: ElijahLynn/redbot
    def fetch_robots_txt(self, url, cb, network=True):
        """
        Fetch the robots.txt URL and then feed the response to cb.
        If the status code is not a 2xx, send a blank doc back.

        If network is False, we won't use the network, will return the result
        immediately if cached, and will assume it's OK if we don't have a
        cached file.
        """

        origin = url_to_origin(self.request.uri)
        if origin is None:
            cb("")
            return ""
        origin_hash = hashlib.sha1(origin).hexdigest()

        if origin in self.robot_files:
            # FIXME: freshness lifetime
            cb(self.robot_files[origin])
            return self.robot_files[origin]

        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                cb(cached_robots_txt)
                return cached_robots_txt

        if not network:
            cb("")
            return ""

        if origin in self.robot_lookups:
            self.robot_lookups[origin].append(cb)
        else:
            self.robot_lookups[origin] = [cb]
            exchange = self.client.exchange()
            @thor.on(exchange)
            def response_start(status, phrase, headers):
                exchange.status = status

            exchange.res_body = ""
            @thor.on(exchange)
            def response_body(chunk):
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers):
                if not exchange.status.startswith("2"):
                    robots_txt = ""
                else:
                    robots_txt = exchange.res_body

                self.robot_files[origin] = robots_txt
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, 60*30)  # fresh for 30 minutes

                for _cb in self.robot_lookups[origin]:
                    _cb(robots_txt)
                del self.robot_lookups[origin]

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start("GET", robots_url,
                [('User-Agent', UA_STRING)])
            exchange.request_done([])
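A hedged usage sketch for the callback API above; the checker object and the
callback name are illustrative assumptions:

def handle_robots_txt(robots_txt):
    # Receives the robots.txt body, or "" when it couldn't be fetched.
    print("got %d bytes of robots.txt" % len(robots_txt))

# Asynchronous: goes to the network when there is no fresh cached copy.
checker.fetch_robots_txt(url, handle_robots_txt)

# Cache-only: with network=False the result comes back immediately, and a
# missing cache entry is treated as "assume it's OK" (blank doc).
checker.fetch_robots_txt(url, handle_robots_txt, network=False)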
Code Example #5
    def check_robots(self, url: str, sync: bool = False) -> Union[bool, None]:
        """
        Fetch the robots.txt for URL.

        When sync is true, the result is returned directly. Sync never goes
        to the network; if there is no local (memory or cache) robots.txt,
        it returns True.

        When sync is false, a "robot-<url>" event will be emitted, with one
        argument: True if the URL is allowed, False if not.
        """

        origin = url_to_origin(url)
        if origin is None:
            if sync:
                return True
            else:
                self.emit("robot-%s" % url, True)
                return None
        origin_hash = hashlib.sha1(origin.encode('ascii', 'replace')).hexdigest()

        if origin in self.robot_checkers:
            return self._robot_check(url, self.robot_checkers[origin], sync)

        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                self._load_checker(origin, cached_robots_txt)
                return self._robot_check(url, self.robot_checkers[origin], sync)

        if sync:
            return True

        if origin in self.robot_lookups:
            self.robot_lookups[origin].add(url)
        else:
            self.robot_lookups[origin] = {url}
            exchange = self.client.exchange()
            @thor.on(exchange)
            def response_start(status: bytes, phrase: bytes, headers: RawHeaderListType) -> None:
                exchange.status = status

            exchange.res_body = b""
            @thor.on(exchange)
            def response_body(chunk: bytes) -> None:
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers: RawHeaderListType) -> None:
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                if self.robot_cache_dir:
                    robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                del self.robot_lookups[origin]

            @thor.on(exchange)
            def error(error: thor.http.error.HttpError) -> None:
                exchange.status = b"500"
                response_done([])

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start(b"GET", robots_url.encode('ascii'),
                                   [(b'User-Agent', UA_STRING.encode('ascii'))])
            exchange.request_done([])
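A hedged usage sketch for this version: the caller subscribes to the per-URL
event before kicking off the check. The fetcher object and handler are
illustrative, and the on() registration is an assumption based on the
self.emit() calls (an EventEmitter-style API):

def on_robot_result(robot_ok):
    # True if robots.txt allows the URL, False if not.
    print("allowed?", robot_ok)

url = "http://example.org/page"
fetcher.on("robot-%s" % url, on_robot_result)  # subscribe first
fetcher.check_robots(url)                      # async path emits the event

# Sync path: never touches the network; True when nothing is known locally.
allowed_now = fetcher.check_robots(url, sync=True)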
Code Example #6
    def check_robots(self, url: str, sync: bool = False) -> Union[bool, None]:
        """
        Fetch the robots.txt for URL.

        When sync is true, the result is returned directly. Sync never goes
        to the network; if there is no local (memory or cache) robots.txt,
        it returns True.

        When sync is false, a "robot-<url>" event will be emitted, with one
        argument: True if the URL is allowed, False if not.
        """

        origin = url_to_origin(url)
        if origin is None:
            if sync:
                return True
            else:
                self.emit("robot-%s" % url, True)
                return None
        origin_hash = hashlib.sha1(origin.encode('ascii',
                                                 'replace')).hexdigest()

        if origin in self.robot_checkers:
            return self._robot_check(url, self.robot_checkers[origin], sync)

        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                self._load_checker(origin, cached_robots_txt)
                return self._robot_check(url, self.robot_checkers[origin],
                                         sync)

        if sync:
            return True

        if origin in self.robot_lookups:
            self.robot_lookups[origin].add(url)
        else:
            self.robot_lookups[origin] = {url}
            exchange = self.client.exchange()

            @thor.on(exchange)
            def response_start(status: bytes, phrase: bytes,
                               headers: RawHeaderListType) -> None:
                exchange.status = status

            exchange.res_body = b""

            @thor.on(exchange)
            def response_body(chunk: bytes) -> None:
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers: RawHeaderListType) -> None:
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                if self.robot_cache_dir:
                    robot_fd = CacheFile(
                        path.join(self.robot_cache_dir, origin_hash))
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                try:
                    del self.robot_lookups[origin]
                except KeyError:
                    pass

            @thor.on(exchange)
            def error(error: thor.http.error.HttpError) -> None:
                exchange.status = b"500"
                response_done([])

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start(
                b"GET", robots_url.encode('ascii'),
                [(b'User-Agent', UA_STRING.encode('ascii'))])
            exchange.request_done([])
        return None
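The fetched bodies are handed to _load_checker, and waiting URLs go through
_robot_check; neither helper is shown in these examples. A minimal sketch
using the standard library's robotparser (an assumption, not redbot's actual
checker) could look like this, with the per-URL emit style of examples #5
and #6:

from urllib import robotparser

def _load_checker(self, origin, robots_txt):
    # Hypothetical: parse the fetched body into a RobotFileParser and
    # index it by origin. An empty body allows everything.
    checker = robotparser.RobotFileParser()
    checker.parse(robots_txt.decode("utf-8", "replace").splitlines())
    self.robot_checkers[origin] = checker

def _robot_check(self, url, checker, sync=False):
    # Hypothetical: ask the checker; return directly in sync mode,
    # otherwise emit the per-URL event.
    robot_ok = checker.can_fetch(UA_STRING, url)
    if sync:
        return robot_ok
    self.emit("robot-%s" % url, robot_ok)
    return None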
Code Example #7
File: robot_fetch.py Project: kom0055/redbot
    def check_robots(self, url: str) -> None:
        """
        Fetch the robots.txt for URL.

        The 'robot' event will be emitted, with a (url, robot_ok) payload.
        """

        origin = url_to_origin(url)
        if origin is None:
            self.emit("robot", (url, True))
            return None
        origin_hash = hashlib.sha1(origin.encode("ascii", "replace")).hexdigest()

        if origin in self.robot_checkers:
            return self._robot_check(url, self.robot_checkers[origin])

        if self.config.get("robot_cache_dir", ""):
            robot_fd = CacheFile(path.join(self.config["robot_cache_dir"], origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                self._load_checker(origin, cached_robots_txt)
                return self._robot_check(url, self.robot_checkers[origin])

        if origin in self.robot_lookups:
            self.robot_lookups[origin].add(url)
        else:
            self.robot_lookups[origin] = {url}
            exchange = self.client.exchange()

            @thor.on(exchange)
            def response_start(
                status: bytes, phrase: bytes, headers: RawHeaderListType
            ) -> None:
                exchange.status = status

            exchange.res_body = b""

            @thor.on(exchange)
            def response_body(chunk: bytes) -> None:
                exchange.res_body += chunk

            @thor.on(exchange)
            def response_done(trailers: RawHeaderListType) -> None:
                if not exchange.status.startswith(b"2"):
                    robots_txt = b""
                else:
                    robots_txt = exchange.res_body

                self._load_checker(origin, robots_txt)
                if self.config.get("robot_cache_dir", ""):
                    robot_fd = CacheFile(
                        path.join(self.config["robot_cache_dir"], origin_hash)
                    )
                    robot_fd.write(robots_txt, self.freshness_lifetime)

                while True:
                    try:
                        check_url = self.robot_lookups[origin].pop()
                    except KeyError:
                        break
                    self._robot_check(check_url, self.robot_checkers[origin])
                try:
                    del self.robot_lookups[origin]
                except KeyError:
                    pass

            @thor.on(exchange)
            def error(error: thor.http.error.HttpError) -> None:
                exchange.status = b"500"
                response_done([])

            p_url = urlsplit(url)
            robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
            exchange.request_start(
                b"GET",
                robots_url.encode("ascii"),
                [(b"User-Agent", UA_STRING.encode("ascii"))],
            )
            exchange.request_done([])
        return None
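A hedged usage sketch for this variant: a single "robot" event carries a
(url, robot_ok) tuple for every checked URL, so one listener covers them
all. The fetcher object is an illustrative assumption:

def on_robot(result):
    url, robot_ok = result  # payload is a (url, robot_ok) tuple
    if not robot_ok:
        print("robots.txt disallows %s" % url)

fetcher.on("robot", on_robot)
fetcher.check_robots("http://example.org/page")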