def response_done(trailers):
    """Record the fetched robots.txt, cache it, and notify waiting callbacks."""
    # Anything other than a 2xx response is treated as an empty
    # (allow-everything) robots.txt.
    robots_txt = exchange.res_body if exchange.status.startswith("2") else ""
    self.robot_files[origin] = robots_txt
    if self.robot_cache_dir:
        cache_path = path.join(self.robot_cache_dir, origin_hash)
        CacheFile(cache_path).write(robots_txt, 60 * 30)  # 30-minute freshness
    # Feed the result to every callback queued while the fetch was in flight,
    # then clear the pending-lookup entry for this origin.
    pending = self.robot_lookups[origin]
    for waiting_cb in pending:
        waiting_cb(robots_txt)
    del self.robot_lookups[origin]
def response_done(trailers: RawHeaderListType) -> None:
    """Install a checker for the fetched robots.txt, cache it, and run the
    robot checks that were queued for this origin while the fetch was in
    flight.
    """
    if not exchange.status.startswith(b"2"):
        # Any non-2xx response is treated as an empty (permissive) robots.txt.
        robots_txt = b""
    else:
        robots_txt = exchange.res_body
    self._load_checker(origin, robots_txt)
    if self.robot_cache_dir:
        robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
        robot_fd.write(robots_txt, self.freshness_lifetime)
    # Drain queued URLs one at a time: _robot_check may re-enter and
    # mutate self.robot_lookups, so don't iterate the set directly.
    while True:
        try:
            check_url = self.robot_lookups[origin].pop()
        except KeyError:
            break
        self._robot_check(check_url, self.robot_checkers[origin])
    # BUGFIX: response_done can run more than once for the same origin
    # (e.g. re-entrantly via an error path), in which case the entry has
    # already been removed; an unguarded del raised KeyError here.
    try:
        del self.robot_lookups[origin]
    except KeyError:
        pass
def response_done(trailers: RawHeaderListType) -> None:
    """Load the fetched robots.txt into a checker, persist it, and drain
    the robot checks queued for this origin."""
    # A non-2xx status is treated as an empty (permissive) robots.txt.
    body = exchange.res_body if exchange.status.startswith(b"2") else b""
    self._load_checker(origin, body)
    if self.robot_cache_dir:
        cache_file = CacheFile(path.join(self.robot_cache_dir, origin_hash))
        cache_file.write(body, self.freshness_lifetime)
    # Pop queued URLs one by one; re-entrant checks may mutate the set.
    while True:
        try:
            queued_url = self.robot_lookups[origin].pop()
        except KeyError:
            break
        self._robot_check(queued_url, self.robot_checkers[origin])
    # The entry may already be gone if this handler ran re-entrantly.
    try:
        del self.robot_lookups[origin]
    except KeyError:
        pass
def fetch_robots_txt(self, url, cb, network=True):
    """
    Fetch the robots.txt URL and then feed the response to cb.
    If the status code is not 200, send a blank doc back.

    If network is False, we won't use the network, will return the
    result immediately if cached, and will assume it's OK if we
    don't have a cached file.
    """
    # BUGFIX: derive the origin from the url argument, not self.request.uri;
    # the robots URL below is built from `url`, so keying the caches on a
    # different URI could mismatch cache entries with fetched content.
    origin = url_to_origin(url)
    if origin is None:
        cb("")
        return ""
    origin_hash = hashlib.sha1(origin).hexdigest()

    # In-memory cache first.
    if origin in self.robot_files:  # dict.has_key() no longer exists
        # FIXME: freshness lifetime
        cb(self.robot_files[origin])
        return self.robot_files[origin]
    # Then the on-disk cache, if configured.
    if self.robot_cache_dir:
        robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
        cached_robots_txt = robot_fd.read()
        if cached_robots_txt is not None:
            cb(cached_robots_txt)
            return cached_robots_txt
    if not network:
        # No network allowed and nothing cached: assume a blank robots.txt.
        cb("")
        return ""

    # Queue the callback; concurrent lookups for the same origin share a list.
    if origin in self.robot_lookups:
        self.robot_lookups[origin].append(cb)
    else:
        self.robot_lookups[origin] = [cb]
    exchange = self.client.exchange()

    @thor.on(exchange)
    def response_start(status, phrase, headers):
        exchange.status = status
        exchange.res_body = ""

    @thor.on(exchange)
    def response_body(chunk):
        exchange.res_body += chunk

    @thor.on(exchange)
    def response_done(trailers):
        # Non-2xx responses are treated as an empty (permissive) robots.txt.
        if not exchange.status.startswith("2"):
            robots_txt = ""
        else:
            robots_txt = exchange.res_body
        self.robot_files[origin] = robots_txt
        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            robot_fd.write(robots_txt, 60 * 30)  # 30-minute freshness
        for _cb in self.robot_lookups[origin]:
            _cb(robots_txt)
        del self.robot_lookups[origin]

    p_url = urlsplit(url)
    robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
    exchange.request_start("GET", robots_url, [('User-Agent', UA_STRING)])
    exchange.request_done([])
def check_robots(self, url: str, sync: bool = False) -> Union[bool, None]:
    """
    Fetch the robots.txt for URL.

    When sync is true, the result is returned. Sync does not go to network; if
    there is not a local (memory or cache) robots.txt, it will return True.

    When it's false, the "robot" event will be emitted, with two arguments:
      - the URL
      - True if it's allowed, False if not
    """
    origin = url_to_origin(url)
    if origin is None:
        # No origin (e.g. unparseable URL): assume allowed.
        if sync:
            return True
        else:
            self.emit("robot-%s" % url, True)
            return None
    origin_hash = hashlib.sha1(origin.encode('ascii', 'replace')).hexdigest()

    # Already have a parsed checker for this origin.
    if origin in self.robot_checkers:
        return self._robot_check(url, self.robot_checkers[origin], sync)

    # Try the on-disk cache, if configured.
    if self.robot_cache_dir:
        robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
        cached_robots_txt = robot_fd.read()
        if cached_robots_txt is not None:  # was "!= None"
            self._load_checker(origin, cached_robots_txt)
            return self._robot_check(url, self.robot_checkers[origin], sync)

    if sync:
        # Sync never goes to the network; assume allowed.
        return True

    # Queue this URL; concurrent lookups for the same origin share one fetch.
    if origin in self.robot_lookups:
        self.robot_lookups[origin].add(url)
    else:
        self.robot_lookups[origin] = {url}
    exchange = self.client.exchange()

    @thor.on(exchange)
    def response_start(status: bytes, phrase: bytes,
                       headers: RawHeaderListType) -> None:
        exchange.status = status
        exchange.res_body = b""

    @thor.on(exchange)
    def response_body(chunk: bytes) -> None:
        exchange.res_body += chunk

    @thor.on(exchange)
    def response_done(trailers: RawHeaderListType) -> None:
        # Non-2xx responses are treated as an empty (permissive) robots.txt.
        if not exchange.status.startswith(b"2"):
            robots_txt = b""
        else:
            robots_txt = exchange.res_body
        self._load_checker(origin, robots_txt)
        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            robot_fd.write(robots_txt, self.freshness_lifetime)
        # Pop one at a time: _robot_check may re-enter and mutate the set.
        while True:
            try:
                check_url = self.robot_lookups[origin].pop()
            except KeyError:
                break
            self._robot_check(check_url, self.robot_checkers[origin])
        # BUGFIX: the error handler calls response_done re-entrantly, so the
        # entry may already be gone; an unguarded del raised KeyError here.
        try:
            del self.robot_lookups[origin]
        except KeyError:
            pass

    @thor.on(exchange)
    def error(error: thor.http.error.HttpError) -> None:
        # Treat any transport error as a server error; reuse the done path.
        exchange.status = b"500"
        response_done([])

    p_url = urlsplit(url)
    robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
    exchange.request_start(b"GET", robots_url.encode('ascii'),
                           [(b'User-Agent', UA_STRING.encode('ascii'))])
    exchange.request_done([])
    return None
def check_robots(self, url: str, sync: bool = False) -> Union[bool, None]:
    """
    Fetch the robots.txt for URL.

    When sync is true, the result is returned. Sync does not go to network; if
    there is not a local (memory or cache) robots.txt, it will return True.

    When it's false, the "robot" event will be emitted, with two arguments:
      - the URL
      - True if it's allowed, False if not
    """
    origin = url_to_origin(url)
    if origin is None:
        # No origin (e.g. unparseable URL): assume allowed.
        if sync:
            return True
        else:
            self.emit("robot-%s" % url, True)
            return None
    origin_hash = hashlib.sha1(origin.encode('ascii', 'replace')).hexdigest()

    # Already have a parsed checker for this origin.
    if origin in self.robot_checkers:
        return self._robot_check(url, self.robot_checkers[origin], sync)

    # Try the on-disk cache, if configured.
    if self.robot_cache_dir:
        robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
        cached_robots_txt = robot_fd.read()
        if cached_robots_txt is not None:  # idiom fix: was "!= None"
            self._load_checker(origin, cached_robots_txt)
            return self._robot_check(url, self.robot_checkers[origin], sync)

    if sync:
        # Sync never goes to the network; assume allowed.
        return True

    # Queue this URL; concurrent lookups for the same origin share one fetch.
    if origin in self.robot_lookups:
        self.robot_lookups[origin].add(url)
    else:
        self.robot_lookups[origin] = {url}  # idiom fix: was set([url])
    exchange = self.client.exchange()

    @thor.on(exchange)
    def response_start(status: bytes, phrase: bytes,
                       headers: RawHeaderListType) -> None:
        exchange.status = status
        exchange.res_body = b""

    @thor.on(exchange)
    def response_body(chunk: bytes) -> None:
        exchange.res_body += chunk

    @thor.on(exchange)
    def response_done(trailers: RawHeaderListType) -> None:
        # Non-2xx responses are treated as an empty (permissive) robots.txt.
        if not exchange.status.startswith(b"2"):
            robots_txt = b""
        else:
            robots_txt = exchange.res_body
        self._load_checker(origin, robots_txt)
        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            robot_fd.write(robots_txt, self.freshness_lifetime)
        # Pop one at a time: _robot_check may re-enter and mutate the set.
        while True:
            try:
                check_url = self.robot_lookups[origin].pop()
            except KeyError:
                break
            self._robot_check(check_url, self.robot_checkers[origin])
        # The error handler calls response_done re-entrantly, so the
        # entry may already have been removed.
        try:
            del self.robot_lookups[origin]
        except KeyError:
            pass

    @thor.on(exchange)
    def error(error: thor.http.error.HttpError) -> None:
        # Treat any transport error as a server error; reuse the done path.
        exchange.status = b"500"
        response_done([])

    p_url = urlsplit(url)
    robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
    exchange.request_start(
        b"GET", robots_url.encode('ascii'),
        [(b'User-Agent', UA_STRING.encode('ascii'))])
    exchange.request_done([])
    return None
def check_robots(self, url: str) -> None:
    """
    Fetch the robots.txt for URL.

    The 'robot' event will be emitted, with a (url, robot_ok) payload.
    """
    origin = url_to_origin(url)
    if origin is None:
        # No origin (e.g. unparseable URL): assume allowed.
        self.emit("robot", (url, True))
        return None
    origin_hash = hashlib.sha1(origin.encode("ascii", "replace")).hexdigest()

    # Already have a parsed checker for this origin.
    if origin in self.robot_checkers:
        return self._robot_check(url, self.robot_checkers[origin])

    # Try the on-disk cache, if configured.
    if self.config.get("robot_cache_dir", ""):
        robot_fd = CacheFile(path.join(self.config["robot_cache_dir"], origin_hash))
        cached_robots_txt = robot_fd.read()
        if cached_robots_txt is not None:
            self._load_checker(origin, cached_robots_txt)
            return self._robot_check(url, self.robot_checkers[origin])

    # Queue this URL; concurrent lookups for the same origin share one fetch.
    if origin in self.robot_lookups:
        self.robot_lookups[origin].add(url)
    else:
        self.robot_lookups[origin] = {url}  # idiom fix: was set([url])
    exchange = self.client.exchange()

    @thor.on(exchange)
    def response_start(
        status: bytes, phrase: bytes, headers: RawHeaderListType
    ) -> None:
        exchange.status = status
        exchange.res_body = b""

    @thor.on(exchange)
    def response_body(chunk: bytes) -> None:
        exchange.res_body += chunk

    @thor.on(exchange)
    def response_done(trailers: RawHeaderListType) -> None:
        # Non-2xx responses are treated as an empty (permissive) robots.txt.
        if not exchange.status.startswith(b"2"):
            robots_txt = b""
        else:
            robots_txt = exchange.res_body
        self._load_checker(origin, robots_txt)
        if self.config.get("robot_cache_dir", ""):
            robot_fd = CacheFile(
                path.join(self.config["robot_cache_dir"], origin_hash)
            )
            robot_fd.write(robots_txt, self.freshness_lifetime)
        # Pop one at a time: _robot_check may re-enter and mutate the set.
        while True:
            try:
                check_url = self.robot_lookups[origin].pop()
            except KeyError:
                break
            self._robot_check(check_url, self.robot_checkers[origin])
        # The error handler calls response_done re-entrantly, so the
        # entry may already have been removed.
        try:
            del self.robot_lookups[origin]
        except KeyError:
            pass

    @thor.on(exchange)
    def error(error: thor.http.error.HttpError) -> None:
        # Treat any transport error as a server error; reuse the done path.
        exchange.status = b"500"
        response_done([])

    p_url = urlsplit(url)
    robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
    exchange.request_start(
        b"GET",
        robots_url.encode("ascii"),
        [(b"User-Agent", UA_STRING.encode("ascii"))],
    )
    exchange.request_done([])
    return None