Example #1
0
    def __init__(self, settings, identity, insocket, outsocket, mgmt, frontier,
            log_handler, log_level, io_loop):
        """
        Initialize the master.
        """
        LoggingMixin.__init__(self, log_handler, log_level)
        self._identity = identity
        self._io_loop = io_loop or IOLoop.instance()

        self._in_stream = ZMQStream(insocket, io_loop)
        self._out_stream = ZMQStream(outsocket, io_loop)

        self._mgmt = mgmt
        self._frontier = frontier

        self._running = False
        self._available_workers = []

        # periodically check if there are pending URIs to crawl
        self._periodic_update = PeriodicCallback(self._send_next_uri,
                settings.MASTER_PERIODIC_UPDATE_INTERVAL, io_loop=io_loop)
        # start this periodic callback when you are waiting for the workers to
        # finish
        self._periodic_shutdown = PeriodicCallback(self._shutdown_wait, 500,
                io_loop=io_loop)
        self._shutdown_counter = 0
        self._logger.debug("zmqmaster::initialized")
Example #2
0
    def __init__(self,
                 settings,
                 log_handler,
                 front_end_queues,
                 prioritizer,
                 unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        The default frontier we will use the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (`sha512`, e.g.)
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a list of uris currently being crawled.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            assert not self._unique_uri.is_known(url, add_if_unknown=True)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")
Example #3
0
    def __init__(self, insocket, outsocket, mgmt, processing, log_handler,
            log_level, io_loop=None):
        """
        Initialize the `ZMQStream` with the `insocket` and `io_loop` and store
        the `outsocket`.

        `insocket` should be of the type `zmq.socket.PULL` `outsocket` should
        be of the type `zmq.socket.PUB`

        `mgmt` is an instance of `spyder.core.mgmt.ZmqMgmt` that handles
        communication between master and worker processes.
        """
        LoggingMixin.__init__(self, log_handler, log_level)

        self._insocket = insocket
        self._io_loop = io_loop or IOLoop.instance()
        self._outsocket = outsocket

        self._processing = processing
        self._mgmt = mgmt
        self._in_stream = ZMQStream(self._insocket, self._io_loop)
        self._out_stream = ZMQStream(self._outsocket, self._io_loop)
Example #4
0
    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
        unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        The default frontier we will use the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (`sha512`, e.g.)
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a list of uris currently being crawled.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            assert not self._unique_uri.is_known(url, add_if_unknown=True)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")