def __getitem__(self, key):
    value = self._domain_metadata[key]
    for k, v in six.iteritems(value):
        if k in self._set_fields:
            value[k] = set(value[k])
    if 'rp_url' in value and 'rp_body' in value:
        value['_rp'] = robotparser.RobotFileParser(value['rp_url'])
        value['_rp'].parse(value['rp_body'].splitlines())
    return value
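
# A minimal, self-contained sketch (not part of the project above) of the same
# rebuild step: persist only the robots.txt URL and body, then recreate the
# RobotFileParser on access. The domain and rules below are made up.
try:
    from urllib import robotparser  # Python 3
except ImportError:
    import robotparser              # Python 2

stored = {
    'rp_url': 'https://example.com/robots.txt',
    'rp_body': 'User-agent: *\nDisallow: /private/\n',
}
rp = robotparser.RobotFileParser(stored['rp_url'])
rp.parse(stored['rp_body'].splitlines())
print(rp.can_fetch('*', 'https://example.com/private/page'))  # False
print(rp.can_fetch('*', 'https://example.com/index.html'))    # True
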
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    body = ''
    if hasattr(response, 'body_as_unicode'):
        body = response.body_as_unicode()
    else:  # last effort try
        try:
            body = response.body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage, disregard it,
            # but keep the lookup cached (in self._parsers).
            # Running rp.parse() will set rp state from
            # 'disallow all' to 'allow any'.
            pass
    rp.parse(body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
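
# The comment above leans on a stdlib behaviour worth spelling out: a freshly
# constructed RobotFileParser refuses everything until parse() has run, and
# parsing an empty body (e.g. after a failed decode) leaves it allowing
# everything. A quick illustration, assuming Python 3's urllib.robotparser:
from urllib import robotparser

rp = robotparser.RobotFileParser('https://example.com/robots.txt')
print(rp.can_fetch('my-bot', 'https://example.com/page'))  # False: nothing parsed yet
rp.parse(''.splitlines())                                   # empty body
print(rp.can_fetch('my-bot', 'https://example.com/page'))  # True: no rules, allow all
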
def _process_robots_txt(self, response, domain):
    """Handle a successful robots.txt response.

    Create a RobotFileParser instance if the robots.txt content can be
    decoded and parsed, and save it as a property of the domain so it can be
    reused later when deciding whether a domain page should be scheduled.
    """
    netloc = response.meta[b'netloc']
    domain.setdefault('queued_pages', 0)
    try:
        # TODO: use encoding from response.meta.get(b'encoding', 'utf-8')
        body = response.body.decode('utf-8')
    except UnicodeDecodeError:
        self.logger.warning("Error during robots.txt decoding at %s", response.url)
        update_domain_with_parser_data(domain, parser=None, url=response.url)
        self._schedule_home_page(netloc, domain)
        return
    robots_lines = body.splitlines()
    parser = robotparser.RobotFileParser(response.url)
    try:
        if not is_valid_robotstxt(robots_lines):
            raise SyntaxError("Robots.txt isn't valid")
        parser.parse(robots_lines)
    except Exception:
        self.logger.exception("Error during robots.txt parsing at %s", response.url)
        update_domain_with_parser_data(domain, parser=None, url=response.url)
        self._schedule_home_page(netloc, domain)
        return
    requests = set()
    for line in robots_lines:
        if line.startswith("Sitemap:"):
            _, _, url = line.partition(':')
            sitemap_url = urljoin(response.url, url.strip())
            meta = {b'seed': domain.get('seed'),
                    b'sitemap': True,
                    b'scrapy_meta': {b'download_maxsize': SITEMAP_DOWNLOAD_MAXSIZE}}
            requests.add(self.create_request(sitemap_url, meta=meta, headers=DEFAULT_HEADERS))
    self.refresh_states(requests)
    # schedule sitemap requests
    self._schedule_requests(requests, domain, score=0.9)
    if not requests:
        self.logger.debug("Sitemap in robots.txt wasn't found for url %s", response.url)
    update_domain_with_parser_data(domain, parser=parser, url=response.url, body=body)
    # also always schedule the home page, regardless of scheduled sitemaps
    self._schedule_home_page(netloc, domain)
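
# The sitemap-extraction loop above, isolated into a standalone sketch; the
# scheduling helpers (create_request, refresh_states, _schedule_requests) are
# project-specific and not reproduced here. The URLs are illustrative only.
from urllib.parse import urljoin

robots_url = 'https://example.com/robots.txt'
robots_lines = [
    'User-agent: *',
    'Disallow: /private/',
    'Sitemap: /sitemap.xml',
    'Sitemap: https://cdn.example.com/sitemap-news.xml',
]
sitemaps = set()
for line in robots_lines:
    if line.startswith('Sitemap:'):
        _, _, url = line.partition(':')
        sitemaps.add(urljoin(robots_url, url.strip()))
print(sorted(sitemaps))
# ['https://cdn.example.com/sitemap-news.xml', 'https://example.com/sitemap.xml']
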
def check_robotstxt(url, useCache, cache_dir, userAgent=None):
    scheme, netloc, url_path, query, fragment = urlparse.urlsplit(url)
    robotstxt_url = urlparse.urlunsplit((scheme, netloc, '/robots.txt', '', ''))

    key = generate_key(robotstxt_url)
    robots_parser = robotparser.RobotFileParser()

    cached_content = cache_get(cache_dir, key) if useCache else ''
    threshold = (time.time() - 86400 * 7)
    if not cached_content or cache_info(cache_dir, key) < threshold:
        try:
            cached_content = fetch(robotstxt_url, userAgent=userAgent)
            if useCache:
                cache_set(cache_dir, key, cached_content)
        except HTTPError as he:
            # this block mimics the behaviour in the robotparser.read() method
            if he.code in (401, 403):
                robots_parser.disallow_all = True
            elif he.code >= 400:
                robots_parser.allow_all = True
            else:
                raise he
            cached_content = ''

    try:
        cached_content = str(cached_content, encoding='utf8')
    except TypeError:
        pass

    robots_parser.parse((x for x in cached_content.split('\n')))

    default_useragent = None
    for k, v in OpenerDirector().addheaders:
        if k == "User-agent":
            default_useragent = v
            break
    return robots_parser.can_fetch(userAgent or default_useragent, url)
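
# The disallow_all / allow_all flags mirror what RobotFileParser.read() does on
# HTTP errors: 401/403 mean "disallow everything", while most other error codes
# mean "allow everything" (the exact code ranges have varied across Python
# versions). A small demonstration of the two flags:
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.disallow_all = True   # the 401/403 case
print(rp.can_fetch('my-bot', 'https://example.com/'))  # False

rp = robotparser.RobotFileParser()
rp.allow_all = True      # the other >= 400 case in the code above
print(rp.can_fetch('my-bot', 'https://example.com/'))  # True
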
def _parse_robots(self, response, netloc):
    rp = robotparser.RobotFileParser(response.url)
    body = ''
    if hasattr(response, 'text'):
        body = response.text
    else:  # last effort try
        try:
            body = response.body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage, disregard it,
            # but keep the lookup cached (in self._parsers).
            # Running rp.parse() will set rp state from
            # 'disallow all' to 'allow any'.
            pass
    # stdlib's robotparser expects a native 'str';
    # with unicode input, decoding non-ASCII bytes fails on Python 2
    rp.parse(to_native_str(body).splitlines())
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = rp
    rp_dfd.callback(rp)
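
# to_native_str comes from the surrounding project (Scrapy historically shipped
# such a helper); a rough stand-in under that assumption, using six as the
# other examples here do:
import six

def to_native_str(text, encoding='utf-8', errors='strict'):
    """Return a native 'str': a bytes str on Python 2, a unicode str on Python 3."""
    if isinstance(text, str):
        return text
    if six.PY2:
        return text.encode(encoding, errors)   # unicode -> native bytes str
    return text.decode(encoding, errors)       # bytes -> native unicode str
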
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    rp.parse(response.body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
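
# Hypothetical downstream use of a per-netloc parser cache like the one
# populated above, with plain stdlib pieces standing in for Scrapy's response
# objects; names and URLs are illustrative only.
from urllib import robotparser
from urllib.parse import urlparse

parsers = {}

rp = robotparser.RobotFileParser('https://example.com/robots.txt')
rp.parse(['User-agent: *', 'Disallow: /admin/'])
parsers['example.com'] = rp

def allowed(url, useragent='my-bot'):
    cached = parsers.get(urlparse(url).netloc)
    return cached is None or cached.can_fetch(useragent, url)

print(allowed('https://example.com/admin/users'))  # False
print(allowed('https://example.com/index.html'))   # True
print(allowed('https://other.example.org/'))       # True: no robots.txt cached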