Code Example #1
File: Crawler.py Project: GunioRobot/heroshi
    def __init__(self, max_connections, input_is_plain):
        self.max_connections = max_connections
        self.input_is_plain = input_is_plain

        self.queue = Queue(1)
        self.closed = False
        self._handler_pool = GreenPool(self.max_connections)
        self._robots_cache = PoolMap(self.get_robots_checker, pool_max_size=1, timeout=600)

        # Start the IO worker and die if it does.
        self.io_worker = io.Worker(lambda: self.closed)
        t = spawn(self.io_worker.run_loop)
        t.link(reraise_errors, greenthread.getcurrent())

        log.debug(u"Crawler started. Max connections: %d.", self.max_connections)
Code Example #2
File: Crawler.py Project: GunioRobot/heroshi
import json
import robotparser
import sys

import eventlet
from eventlet import GreenPool, greenthread, sleep, spawn
from eventlet.queue import Empty, Queue

# Project-internal names used below (io, PoolMap, reraise_errors, log,
# settings, RobotsError, FetchError) come from heroshi's own modules,
# which are not part of this listing.


class Crawler(object):
    def __init__(self, max_connections, input_is_plain):
        self.max_connections = max_connections
        self.input_is_plain = input_is_plain

        self.queue = Queue(1)
        self.closed = False
        self._handler_pool = GreenPool(self.max_connections)
        self._robots_cache = PoolMap(self.get_robots_checker, pool_max_size=1, timeout=600)

        # Start the IO worker and die if it does.
        self.io_worker = io.Worker(lambda: self.closed)
        t = spawn(self.io_worker.run_loop)
        t.link(reraise_errors, greenthread.getcurrent())

        log.debug(u"Crawler started. Max connections: %d.", self.max_connections)

    def crawl(self, forever=True):
        # TODO: do something special about signals?

        if forever:
            self.start_queue_updater()

        while not self.closed:
            # `get_nowait` only works together with `sleep(0)` here, because we
            # need a greenlet switch so exceptions from `do_process` can be
            # re-raised in this greenthread.
            sleep(0)
            try:
                item = self.queue.get_nowait()
            except Empty:
                if not forever:
                    self.graceful_stop()
                sleep(0.01)
                continue
            t = self._handler_pool.spawn(self.do_process, item)
            t.link(reraise_errors, greenthread.getcurrent())

    def stop(self):
        self.closed = True

    def graceful_stop(self, timeout=None):
        """Stop the crawler and wait for all already started crawling requests to finish.

        If `timeout` is supplied, wait at most `timeout` seconds and return
        True if that was enough for everything to finish, False otherwise.
        """
        self.closed = True
        if timeout is not None:
            with eventlet.Timeout(timeout, False):
                if hasattr(self, "_queue_updater_thread"):
                    self._queue_updater_thread.kill()
                self._handler_pool.waitall()
                return True
            return False
        else:
            if hasattr(self, "_queue_updater_thread"):
                self._queue_updater_thread.kill()
            self._handler_pool.waitall()

    def start_queue_updater(self):
        self._queue_updater_thread = spawn(self.queue_updater)
        self._queue_updater_thread.link(reraise_errors, greenthread.getcurrent())

    def queue_updater(self):
        log.debug("Waiting for crawl jobs on stdin.")
        for line in sys.stdin:
            if self.closed:
                break

            line = line.strip()

            if self.input_is_plain:
                job = {"url": line}
            else:
                try:
                    job = json.loads(line)
                except ValueError:
                    log.error(u"Decoding input line: %s", line)
                    continue

            # extend worker queue
            # 1. skip duplicate URLs
            for queue_item in self.queue.queue:
                if queue_item["url"] == job["url"]:  # compare URLs
                    break
            else:
                # 2. extend queue with new items
                # May block here when the queue is full; that is intentional backpressure.
                self.queue.put(job)

        # Stdin exhausted -> stop.
        while not self.queue.empty():
            sleep(0.01)

        sleep(2)  # FIXME: Crutch to prevent stopping too early.

        self.graceful_stop()

    def get_robots_checker(self, scheme, authority):
        """PoolMap func :: scheme, authority -> (agent, uri -> bool)."""
        robots_uri = "%s://%s/robots.txt" % (scheme, authority)

        fetch_result = self.io_worker.fetch(robots_uri)
        # The IO worker returns None when the crawler is being stopped gracefully.
        if fetch_result is None:
            return None

        if fetch_result["success"]:
            # TODO: set the expiration time from the response headers,
            # but that has to happen after `self._robots_cache.put`, or be handled some other way.
            if 200 <= fetch_result["status_code"] < 300:
                parser = robotparser.RobotFileParser()
                content_lines = fetch_result["content"].splitlines()
                try:
                    parser.parse(content_lines)
                except KeyError:
                    raise RobotsError(u"Known robotparser bug: KeyError at urllib.quote(path).")
                return parser.can_fetch
            # Authorization required and Forbidden are considered Disallow all.
            elif fetch_result["status_code"] in (401, 403):
                return lambda _agent, _uri: False
            # /robots.txt Not Found is considered Allow all.
            elif fetch_result["status_code"] == 404:
                return lambda _agent, _uri: True
            # FIXME: this is an optimistic rule and should probably be refined with more specific checks
            elif fetch_result["status_code"] >= 400:
                return lambda _agent, _uri: True
            # What other cases left? 100 and redirects. Consider it Disallow all.
            else:
                return lambda _agent, _uri: False
        else:
            raise FetchError(u"/robots.txt fetch problem: %s" % (fetch_result["result"]))

    def ask_robots(self, uri, scheme, authority):
        key = scheme + ":" + authority
        with self._robots_cache.getc(key, scheme, authority) as checker:
            try:
                # `checker` is None when the crawler is being stopped gracefully.
                if checker is None:
                    return None
                return checker(settings.identity["name"], uri)
            except Exception as e:
                log.exception(u"Get rid of this. ask_robots @ %s", uri)
                raise RobotsError(u"Error checking robots.txt permissions for URI '%s': %s" % (uri, unicode(e)))