Esempio n. 1
0
    def __init__(self, settings, log_handler):
        """
        Set up the base frontier and the multiple-host specific state.

        The base class is initialized with a `SQLiteMultipleHostUriQueue`
        opened on `settings.FRONTIER_STATE_FILE` and an instance of the class
        named by `settings.PRIORITIZER_CLASS`. After that the delay, active
        queue and budget settings are copied onto the instance, the known
        queues are collected and the queue selector / queue assignment
        helpers configured in `settings` are instantiated.
        """
        prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
        uri_queue = SQLiteMultipleHostUriQueue(settings.FRONTIER_STATE_FILE)
        prioritizer = prioritizer_cls(settings)
        AbstractBaseFrontier.__init__(
            self, settings, log_handler, uri_queue, prioritizer)

        # values taken verbatim from the settings
        self._delay_factor = settings.FRONTIER_CRAWL_DELAY_FACTOR
        self._min_delay = settings.FRONTIER_MIN_DELAY
        self._num_active_queues = settings.FRONTIER_ACTIVE_QUEUES
        self._max_queue_budget = settings.FRONTIER_QUEUE_BUDGET
        self._budget_punishment = settings.FRONTIER_QUEUE_BUDGET_PUNISH

        # keep the first element of every entry reported by the front end
        # queues (presumably provided by the base class -- verify there)
        self._queue_ids = [
            queue_id
            for (queue_id, _) in self._front_end_queues.get_all_queues()
        ]

        # the selector is told how many queues are currently known
        selector_cls = import_class(settings.QUEUE_SELECTOR_CLASS)
        self._backend_selector = selector_cls(len(self._queue_ids))

        assignment_cls = import_class(settings.QUEUE_ASSIGNMENT_CLASS)
        self._backend_assignment = assignment_cls(self._dns_cache)

        # bookkeeping containers, all empty after construction
        self._current_queues = {}
        self._current_queues_in_heap = []
        self._time_politeness = {}
        self._budget_politeness = {}
Esempio n. 2
0
    def __init__(self, settings, log_handler):
        """
        Initialize the abstract base frontier and the state of this
        implementation from the configuration in `settings`.

        The base frontier receives a `SQLiteMultipleHostUriQueue` created
        from `settings.FRONTIER_STATE_FILE` and an instance of the configured
        prioritizer class; `log_handler` is passed through unchanged.
        """
        prioritizer_type = import_class(settings.PRIORITIZER_CLASS)
        front_end = SQLiteMultipleHostUriQueue(settings.FRONTIER_STATE_FILE)
        AbstractBaseFrontier.__init__(
            self, settings, log_handler, front_end,
            prioritizer_type(settings))

        # delay configuration
        self._delay_factor = settings.FRONTIER_CRAWL_DELAY_FACTOR
        self._min_delay = settings.FRONTIER_MIN_DELAY
        # queue and budget configuration
        self._num_active_queues = settings.FRONTIER_ACTIVE_QUEUES
        self._max_queue_budget = settings.FRONTIER_QUEUE_BUDGET
        self._budget_punishment = settings.FRONTIER_QUEUE_BUDGET_PUNISH

        # `get_all_queues` yields pairs; only the first element is kept
        all_queues = self._front_end_queues.get_all_queues()
        self._queue_ids = [ident for (ident, _) in all_queues]

        # selector gets the number of known queues, the assignment helper
        # gets the dns cache of this frontier
        selector_type = import_class(settings.QUEUE_SELECTOR_CLASS)
        self._backend_selector = selector_type(len(self._queue_ids))

        assignment_type = import_class(settings.QUEUE_ASSIGNMENT_CLASS)
        self._backend_assignment = assignment_type(self._dns_cache)

        # runtime state starts out empty
        self._current_queues = {}
        self._current_queues_in_heap = []
        self._time_politeness = {}
        self._budget_politeness = {}
Esempio n. 3
0
    def __init__(self, settings, log_handler):
        """
        Initialize the base frontier with a single-host uri queue.

        The queue is a `SQLiteSingleHostUriQueue` opened on
        `settings.FRONTIER_STATE_FILE`; the prioritizer is an instance of the
        class named by `settings.PRIORITIZER_CLASS`.
        """
        prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
        uri_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
        prioritizer = prioritizer_cls(settings)
        AbstractBaseFrontier.__init__(
            self, settings, log_handler, uri_queue, prioritizer)

        # delay configuration copied from the settings
        self._crawl_delay = settings.FRONTIER_CRAWL_DELAY_FACTOR
        self._min_delay = settings.FRONTIER_MIN_DELAY
        # starts out as the time of construction
        self._next_possible_crawl = time.time()
Esempio n. 4
0
    def __init__(self, settings, log_handler):
        """
        Set up the base frontier for a single host.

        `settings` supplies the prioritizer class, the state file for the
        `SQLiteSingleHostUriQueue` and the delay values; `log_handler` is
        handed to the base class as is.
        """
        prioritizer_type = import_class(settings.PRIORITIZER_CLASS)
        front_end = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
        AbstractBaseFrontier.__init__(
            self, settings, log_handler, front_end,
            prioritizer_type(settings))

        self._crawl_delay = settings.FRONTIER_CRAWL_DELAY_FACTOR
        self._min_delay = settings.FRONTIER_MIN_DELAY

        # initialized to "now"
        now = time.time()
        self._next_possible_crawl = now
Esempio n. 5
0
def create_processing_function(settings, pipeline):
    """
    Build a callable that runs a message through all processors of
    `pipeline`.

    Every entry of `pipeline` is resolved with `import_class` and
    instantiated once with `settings`. The returned function passes its
    argument to the first processor, that result to the second one and so
    on, and returns the result of the last processor. With an empty pipeline
    the message is returned unchanged.
    """
    stages = [import_class(name)(settings) for name in pipeline]

    def processing(data_message):
        """
        Apply the configured processors to `data_message`, in the order of
        the pipeline, and return the final message.
        """
        current = data_message
        for stage in stages:
            current = stage(current)
        return current

    return processing
Esempio n. 6
0
def main(settings):
    """
    Main method for master processes.

    Sets up logging over a ZeroMQ PUB socket, creates the management
    component, the frontier and the PUSH/SUB sockets, builds the `ZmqMaster`
    and runs the IOLoop until it returns. Afterwards the master, the
    management component, the logging socket and the ZeroMQ context are
    closed.

    `settings` has to provide `ZEROMQ_LOGGING`, `LOG_LEVEL_MASTER`,
    `ZEROMQ_MASTER_PUSH_HWM`, `ZEROMQ_MASTER_PUSH`, `ZEROMQ_MASTER_SUB` and
    `MASTER_CALLBACK`; it is also handed on to the components created here.
    """
    # create my own identity: "master:<hostname>:<pid>"
    identity = "master:%s:%s" % (socket.gethostname(), os.getpid())

    ctx = zmq.Context()
    io_loop = IOLoop.instance()

    # initialize the logging subsystem: records of the root logger are
    # published on a PUB socket under the "spyder.master" topic
    log_pub = ctx.socket(zmq.PUB)
    log_pub.connect(settings.ZEROMQ_LOGGING)
    zmq_logging_handler = PUBHandler(log_pub)
    zmq_logging_handler.root_topic = "spyder.master"
    logger = logging.getLogger()
    logger.addHandler(zmq_logging_handler)
    logger.setLevel(settings.LOG_LEVEL_MASTER)

    logger.info("process::Starting up the master")

    mgmt = create_master_management(settings, ctx, io_loop)
    frontier = create_frontier(settings, zmq_logging_handler)

    # PUSH socket of the master; the high water mark is set before binding
    publishing_socket = ctx.socket(zmq.PUSH)
    publishing_socket.setsockopt(zmq.HWM, settings.ZEROMQ_MASTER_PUSH_HWM)
    publishing_socket.bind(settings.ZEROMQ_MASTER_PUSH)

    # SUB socket of the master; the empty subscription prefix means no
    # filtering by topic
    receiving_socket = ctx.socket(zmq.SUB)
    receiving_socket.setsockopt(zmq.SUBSCRIBE, "")
    receiving_socket.bind(settings.ZEROMQ_MASTER_SUB)

    master = ZmqMaster(settings, identity, receiving_socket, publishing_socket,
                       mgmt, frontier, zmq_logging_handler,
                       settings.LOG_LEVEL_MASTER, io_loop)

    def handle_shutdown_signal(_sig, _frame):
        """
        Called from the os when a shutdown signal is fired.

        Both arguments (signal number and stack frame) are ignored.
        """
        master.shutdown()
        # zmq 2.1 stops blocking calls, restart the ioloop
        io_loop.start()

    # handle kill signals: SIGINT and SIGTERM both trigger the shutdown
    signal.signal(signal.SIGINT, handle_shutdown_signal)
    signal.signal(signal.SIGTERM, handle_shutdown_signal)

    # optional hook: if configured, the callback is imported and invoked
    # once, before management and master are started
    if settings.MASTER_CALLBACK:
        callback = import_class(settings.MASTER_CALLBACK)
        callback(settings, ctx, io_loop, frontier)

    mgmt.start()
    master.start()

    # this will block until the master stops
    try:
        io_loop.start()
    except ZMQError:
        # only logged at debug level; any other exception propagates and
        # skips the cleanup below
        logger.debug("Caught a ZMQError. Hopefully during shutdown")
        logger.debug(traceback.format_exc())

    # tear down: master and management first, the logging socket after the
    # final log message, the context last
    master.close()
    mgmt.close()

    logger.info("process::Master is down.")
    log_pub.close()

    ctx.term()
Esempio n. 7
0
def create_frontier(settings, log_handler):
    """
    Instantiate the frontier configured in `settings.FRONTIER_CLASS`.

    The class is resolved with `import_class` and called with `settings` and
    `log_handler`; the resulting object is returned.
    """
    frontier_cls = import_class(settings.FRONTIER_CLASS)
    instance = frontier_cls(settings, log_handler)
    return instance