Example no. 1
class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        try:
            robotstxt_body = to_native_str(robotstxt_body)
        except UnicodeDecodeError:
            # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
            # Switch to 'allow all' state.
            logger.warning("Failure while parsing robots.txt using %(parser)s."
                           " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                           {'parser': "RobotFileParser"},
                           exc_info=sys.exc_info(),
                           extra={'spider': self.spider})
            robotstxt_body = ''
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
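
A minimal usage sketch for this parser (not from the original source; the robots.txt body, URLs, and user agent below are made up for illustration):

body = b"User-agent: *\nDisallow: /private/\n"
parser = PythonRobotParser(body, spider=None)
parser.allowed("https://example.com/private/page", "mybot")  # False: /private/ is disallowed
parser.allowed("https://example.com/index.html", "mybot")    # True
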
Example no. 2
 def __init__(self, robotstxt_body, spider):
     from six.moves.urllib_robotparser import RobotFileParser
     self.spider = spider
     # Decode the raw robots.txt body to a native string; decoding problems
     # are handled inside Scrapy's decode_robotstxt() helper.
     robotstxt_body = decode_robotstxt(
         robotstxt_body, spider, to_native_str_type=True)
     self.rp = RobotFileParser()
     self.rp.parse(robotstxt_body.splitlines())
Example no. 3
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      raise_for_status=False)
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
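
A hedged usage sketch for this helper (the URL and user agent are placeholders; urlopen and MaxContentBytes here are the surrounding project's own helpers, not the standard library's):

rp = get_robotstxt_parser("https://example.com/robots.txt")
if rp.can_fetch("mybot", "https://example.com/some/page"):
    pass  # robots.txt allows fetching this page (or was absent/unreadable)
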
Example no. 4
 def __init__(self, robotstxt_body, spider):
     from six.moves.urllib_robotparser import RobotFileParser
     self.spider = spider
     try:
         robotstxt_body = to_native_str(robotstxt_body)
     except UnicodeDecodeError:
         # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
         # Switch to 'allow all' state.
         logger.warning("Failure while parsing robots.txt using %(parser)s."
                        " File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                        {'parser': "RobotFileParser"},
                        exc_info=sys.exc_info(),
                        extra={'spider': self.spider})
         robotstxt_body = ''
     self.rp = RobotFileParser()
     self.rp.parse(robotstxt_body.splitlines())
Example no. 5
class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(
            robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
Example no. 6
def get_robotstxt_parser(url, session=None):
    """Get a RobotFileParser for the given robots.txt URL."""
    rp = RobotFileParser()
    try:
        req = urlopen(url, session, max_content_bytes=MaxContentBytes,
                      allow_errors=range(600))
    except Exception:
        # connect or timeout errors are treated as an absent robots.txt
        rp.allow_all = True
    else:
        if req.status_code >= 400:
            rp.allow_all = True
        elif req.status_code == 200:
            rp.parse(req.text.splitlines())
    return rp
Example no. 7
 def __init__(self, user_agent, url):
     # Remember the default user agent and let RobotFileParser handle the URL.
     self.url = url
     self.user_agent = user_agent
     RobotFileParser.__init__(self, self.url)
Example no. 8
 def can_fetch(self, url, useragent=None):
     # Delegate to the base class, defaulting to this instance's user agent.
     return RobotFileParser.can_fetch(self,
                                      useragent=useragent or self.user_agent,
                                      url=url)
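
Examples no. 7 and no. 8 look like two methods of the same RobotFileParser subclass. A self-contained sketch of that pattern, assuming the standard library parser and a hypothetical class name:

from urllib.robotparser import RobotFileParser

class DefaultAgentRobotParser(RobotFileParser):
    """Hypothetical subclass combining the two methods shown above."""

    def __init__(self, user_agent, url):
        self.url = url
        self.user_agent = user_agent
        RobotFileParser.__init__(self, self.url)

    def can_fetch(self, url, useragent=None):
        # Fall back to the default user agent stored on the instance.
        return RobotFileParser.can_fetch(
            self, useragent or self.user_agent, url)

rp = DefaultAgentRobotParser("mybot", "https://example.com/robots.txt")
rp.read()  # download and parse the robots.txt at rp.url
rp.can_fetch("https://example.com/page")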