Example 1
    def allowed_url(self):
        #FIXME: Should use the geturl address as it may have been redirected
        scheme, netloc, path, query, fragment = urlsplit(self.url)
        robot_url = urlunsplit([scheme, netloc, "robots.txt", "", ""])

        #FIXME: Should cache robots.txt in a better persistent data structure
        if robot_url in ROBOT_CACHE:
            rp = ROBOT_CACHE[robot_url]
        else:
            rp = RobotExclusionRulesParser()
            try:
                rp.fetch(robot_url)
            # Currently if there's a problem we assume there is no robots.txt
            except IOError:
                # Should be catching the urllib2.URLError exception
                logging.debug("Couldn't retrieve robots.txt for %s" %
                              robot_url)
                rp = None
            except UnicodeDecodeError:
                logging.debug("Unicode decode error for robots.txt at %s" %
                              robot_url)
                rp = None
            except httplib.HTTPException:
                logging.debug("Generic HTTPException for robots.txt at %s" %
                              robot_url)
                rp = None
            ROBOT_CACHE[robot_url] = rp

        if rp is None or rp.is_allowed("*", self.url):
            base_url = urlunsplit([scheme, netloc, "", "", ""])

            # If there's a current delay on the site, respect robots.txt and stall
            if self.db.exists(netloc):
                logging.debug("Obeying robot overlord for %s..." % netloc)
                URLHandler.add_to_busy(self.db, self.url)
                return False

            # Set a delay for any other requests to this site to respect robots.txt
            delay = rp.get_crawl_delay("*") if rp else None
            if delay:
                delay = int(math.ceil(float(delay)))
            else:
                delay = SETTINGS["DEFAULT_ROBOTS_DELAY"]
            self.db.setex(netloc, "1", delay)

            return True
        else:
            return False
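
The method above mixes robots.txt handling with project-specific pieces (SETTINGS, ROBOT_CACHE, the Redis-backed self.db, URLHandler). For reference, here is a minimal self-contained sketch of the same fetch, cache and crawl-delay pattern using only the RobotExclusionRulesParser calls shown in the example; the dict cache, default delay and helper names are illustrative, and the sketch assumes Python 3 plus the robotexclusionrulesparser package.

import logging
import math
from urllib.parse import urlsplit, urlunsplit

from robotexclusionrulesparser import RobotExclusionRulesParser

ROBOT_CACHE = {}          # robots.txt URL -> parser, or None when the fetch failed
DEFAULT_ROBOTS_DELAY = 5  # seconds; illustrative fallback, not from the example

def robots_for(url):
    """Return a cached RobotExclusionRulesParser for the site of `url`, or None."""
    scheme, netloc, _path, _query, _fragment = urlsplit(url)
    robot_url = urlunsplit([scheme, netloc, "/robots.txt", "", ""])
    if robot_url not in ROBOT_CACHE:
        rp = RobotExclusionRulesParser()
        try:
            rp.fetch(robot_url)
        except Exception:
            # Mirror the example's policy: on any failure, assume there is no robots.txt.
            logging.debug("Couldn't retrieve robots.txt for %s", robot_url)
            rp = None
        ROBOT_CACHE[robot_url] = rp
    return ROBOT_CACHE[robot_url]

def allowed_url(url, user_agent="*"):
    """Return (allowed, crawl_delay_in_seconds) for `url`."""
    rp = robots_for(url)
    if rp is not None and not rp.is_allowed(user_agent, url):
        return False, None
    delay = rp.get_crawl_delay(user_agent) if rp else None
    delay = int(math.ceil(float(delay))) if delay else DEFAULT_ROBOTS_DELAY
    return True, delay

if __name__ == "__main__":
    print(allowed_url("https://www.python.org/about/"))
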
Example 2
class RerpWrapper(python_common.web.robots_txt.parser_base.RobotsTxtParser):
    def __init__(self, content=None, expires=None):
        super(RerpWrapper, self).__init__(content, expires)
        if content:
            self.parser = RobotExclusionRulesParser()
            self.parser.use_local_time = False
            self.parser.expiration_date = self.expires
            self.parser.parse(content)
        else:
            self.parser = None
            self.my_super = super(RerpWrapper, self)

    def allowed(self, user_agent, url):
        if self.parser:
            return self.parser.is_allowed(user_agent, url)
        return self.my_super.allowed(user_agent, url)

    def delay(self, user_agent):
        if self.parser:
            return self.parser.get_crawl_delay(user_agent)
        return self.my_super.delay(user_agent)

    @property
    def expired(self):
        return self.parser.is_expired if self.parser else self.my_super.expired

    @property
    def sitemaps(self):
        return self.parser.sitemaps if self.parser else self.my_super.sitemaps
Example 3
class Robot:
    def __init__(self, url):
        self.url = Url(urljoin(url, '/robots.txt'))
        self.rerp = RobotExclusionRulesParser()
        self.rerp.user_agent = 'Mozilla/5.0'
        self.rerp.fetch(self.url.url())

    def throttle_time(self):
        return self.rerp.get_crawl_delay('Mozilla/5.0')

    def should_block(self, url):
        return not self.rerp.is_allowed('Mozilla/5.0', url.url())
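
The wrapper in Example 2 and the Robot class above rely on the same small parser API. A quick way to try it without any network access is to feed robots.txt text straight to parse(), as Example 2 does; the rules and URLs below are made up for illustration.

from robotexclusionrulesparser import RobotExclusionRulesParser

ROBOTS_TXT = """
User-agent: *
Disallow: /private/
Crawl-delay: 10

Sitemap: https://example.com/sitemap.xml
"""

parser = RobotExclusionRulesParser()
parser.parse(ROBOTS_TXT)

print(parser.is_allowed("Mozilla/5.0", "https://example.com/index.html"))  # True
print(parser.is_allowed("Mozilla/5.0", "https://example.com/private/x"))   # False
print(parser.get_crawl_delay("Mozilla/5.0"))  # the Crawl-delay value, or None if absent
print(parser.sitemaps)                        # any Sitemap URLs listed in the file
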