# Scrapy's PythonRobotParser, backed by the stdlib robots.txt parser.
# RobotParser (the abstract interface) and to_native_str are defined
# elsewhere in Scrapy (scrapy.robotstxt / scrapy.utils.python).
import logging
import sys

logger = logging.getLogger(__name__)


class PythonRobotParser(RobotParser):

    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        try:
            robotstxt_body = to_native_str(robotstxt_body)
        except UnicodeDecodeError:
            # If we found garbage or robots.txt in an encoding other than
            # UTF-8, disregard it. Switch to 'allow all' state.
            logger.warning(
                "Failure while parsing robots.txt using %(parser)s."
                " File either contains garbage or is in an encoding other"
                " than UTF-8, treating it as an empty file.",
                {'parser': "RobotFileParser"},
                exc_info=sys.exc_info(),
                extra={'spider': self.spider})
            robotstxt_body = ''
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
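The fallback in the except branch above can be shown in isolation. This is an illustrative helper, not Scrapy's actual code: an undecodable body is treated as an empty robots.txt, which the stdlib parser interprets as allow-all.

def decode_or_empty(body):
    """Illustrative only: undecodable robots.txt bodies become empty files."""
    try:
        return body.decode('utf-8')
    except UnicodeDecodeError:
        return ''

print(repr(decode_or_empty(b'User-agent: *')))   # 'User-agent: *'
print(repr(decode_or_empty(b'\xff\xfegarbage'))) # '' (invalid UTF-8)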
# Fetch and parse a site's robots.txt. urlopen and MaxContentBytes appear to
# be project helpers (a session-aware HTTP wrapper and a size cap), not the
# stdlib; RobotFileParser is the stdlib one.
from urllib.robotparser import RobotFileParser


def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      raise_for_status=False)
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
# Later variant of the class above: the inline try/except is replaced by
# Scrapy's decode_robotstxt helper, which applies the same
# "undecodable means empty" policy.
class PythonRobotParser(RobotParser):

    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider,
                                          to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
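A minimal usage sketch, assuming a Scrapy version (1.8+) that ships scrapy.robotstxt.PythonRobotParser; the spider argument is only used for logging and may be None here.

from scrapy.robotstxt import PythonRobotParser

body = b"User-agent: *\nDisallow: /private/\n"
parser = PythonRobotParser(body, spider=None)
print(parser.allowed("https://example.com/", "mybot"))           # True
print(parser.allowed("https://example.com/private/x", "mybot"))  # False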
# Variant of the fetcher above: instead of raise_for_status=False, it asks
# the urlopen helper to tolerate every HTTP status (0-599) via allow_errors,
# then applies the same status checks itself.
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
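Both variants implement the same policy: a network failure or an HTTP error yields an allow-all parser. A stdlib-only sketch of that policy, with no project helpers (fetch_robotstxt_parser and the size cap are illustrative names, not taken from the snippets above):

from urllib.request import urlopen
from urllib.robotparser import RobotFileParser


def fetch_robotstxt_parser(url, timeout=10):
    rp = RobotFileParser()
    try:
        with urlopen(url, timeout=timeout) as resp:
            body = resp.read(512 * 1024).decode('utf-8', errors='ignore')
    except OSError:
        # URLError, HTTPError (>= 400 responses) and socket timeouts are all
        # OSError subclasses: treat robots.txt as absent, i.e. allow all.
        rp.allow_all = True
    else:
        rp.parse(body.splitlines())
    return rp


rp = fetch_robotstxt_parser("https://www.python.org/robots.txt")
print(rp.can_fetch("mybot", "https://www.python.org/"))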
# __init__ of a RobotFileParser subclass that remembers a default user agent.
# (The explicit self.url assignment is redundant; RobotFileParser.__init__
# sets it via set_url.)
def __init__(self, user_agent, url):
    self.url = url
    self.user_agent = user_agent
    RobotFileParser.__init__(self, self.url)
def can_fetch(self, url, useragent=None):
    # Return the result; the original snippet computed and discarded it,
    # so callers always got None.
    return RobotFileParser.can_fetch(
        self, useragent=useragent or self.user_agent, url=url)
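The two methods above read like parts of a single RobotFileParser subclass that pins a default user agent and flips can_fetch's argument order. A self-contained sketch under that assumption (the class name is invented here; the original snippets do not show it):

from urllib.robotparser import RobotFileParser


class UserAgentRobotFileParser(RobotFileParser):
    """Hypothetical reassembly of the two methods above."""

    def __init__(self, user_agent, url):
        self.user_agent = user_agent
        RobotFileParser.__init__(self, url)

    def can_fetch(self, url, useragent=None):
        # Fall back to the pinned user agent, and return the verdict.
        return RobotFileParser.can_fetch(
            self, useragent or self.user_agent, url)


rp = UserAgentRobotFileParser("mybot", "https://example.com/robots.txt")
rp.parse("User-agent: *\nDisallow: /private/".splitlines())
print(rp.can_fetch("https://example.com/private/page"))  # False
print(rp.can_fetch("https://example.com/public"))        # True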