def __init__(self, settings, log_handler):
    """
    Set up the abstract base frontier and the state of this implementation.

    The prioritizer class is resolved from `settings.PRIORITIZER_CLASS` and
    the URI queue is a `SQLiteMultipleHostUriQueue` opened on
    `settings.FRONTIER_STATE_FILE`; both are handed to
    `AbstractBaseFrontier.__init__` along with `settings` and `log_handler`.
    """
    prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
    uri_queue = SQLiteMultipleHostUriQueue(settings.FRONTIER_STATE_FILE)
    prioritizer = prioritizer_cls(settings)
    AbstractBaseFrontier.__init__(
        self, settings, log_handler, uri_queue, prioritizer)

    # plain configuration values copied from the settings
    self._delay_factor = settings.FRONTIER_CRAWL_DELAY_FACTOR
    self._min_delay = settings.FRONTIER_MIN_DELAY
    self._num_active_queues = settings.FRONTIER_ACTIVE_QUEUES
    self._max_queue_budget = settings.FRONTIER_QUEUE_BUDGET
    self._budget_punishment = settings.FRONTIER_QUEUE_BUDGET_PUNISH

    # keep the first element of every pair reported by the front end queues
    self._queue_ids = [
        queue_id
        for (queue_id, _) in self._front_end_queues.get_all_queues()
    ]

    # the selector is constructed with the number of known queues
    selector_cls = import_class(settings.QUEUE_SELECTOR_CLASS)
    self._backend_selector = selector_cls(len(self._queue_ids))

    # the assignment strategy is constructed with the dns cache
    assignment_cls = import_class(settings.QUEUE_ASSIGNMENT_CLASS)
    self._backend_assignment = assignment_cls(self._dns_cache)

    # bookkeeping containers, all empty at start
    self._current_queues = {}
    self._current_queues_in_heap = []
    self._time_politeness = {}
    self._budget_politeness = {}
def __init__(self, settings, log_handler):
    """
    Set up the abstract base frontier and the state of this implementation.

    The prioritizer class is resolved from `settings.PRIORITIZER_CLASS` and
    the URI queue is a `SQLiteMultipleHostUriQueue` opened on
    `settings.FRONTIER_STATE_FILE`; both are handed to
    `AbstractBaseFrontier.__init__` along with `settings` and `log_handler`.
    """
    prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
    uri_queue = SQLiteMultipleHostUriQueue(settings.FRONTIER_STATE_FILE)
    prioritizer = prioritizer_cls(settings)
    AbstractBaseFrontier.__init__(
        self, settings, log_handler, uri_queue, prioritizer)

    # plain configuration values copied from the settings
    self._delay_factor = settings.FRONTIER_CRAWL_DELAY_FACTOR
    self._min_delay = settings.FRONTIER_MIN_DELAY
    self._num_active_queues = settings.FRONTIER_ACTIVE_QUEUES
    self._max_queue_budget = settings.FRONTIER_QUEUE_BUDGET
    self._budget_punishment = settings.FRONTIER_QUEUE_BUDGET_PUNISH

    # keep the first element of every pair reported by the front end queues
    self._queue_ids = [
        queue_id
        for (queue_id, _) in self._front_end_queues.get_all_queues()
    ]

    # the selector is constructed with the number of known queues
    selector_cls = import_class(settings.QUEUE_SELECTOR_CLASS)
    self._backend_selector = selector_cls(len(self._queue_ids))

    # the assignment strategy is constructed with the dns cache
    assignment_cls = import_class(settings.QUEUE_ASSIGNMENT_CLASS)
    self._backend_assignment = assignment_cls(self._dns_cache)

    # bookkeeping containers, all empty at start
    self._current_queues = {}
    self._current_queues_in_heap = []
    self._time_politeness = {}
    self._budget_politeness = {}
def __init__(self, settings, log_handler):
    """
    Set up the base frontier.

    The prioritizer class is resolved from `settings.PRIORITIZER_CLASS` and
    the URI queue is a `SQLiteSingleHostUriQueue` opened on
    `settings.FRONTIER_STATE_FILE`; both are handed to
    `AbstractBaseFrontier.__init__` along with `settings` and `log_handler`.
    """
    prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
    uri_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
    prioritizer = prioritizer_cls(settings)
    AbstractBaseFrontier.__init__(
        self, settings, log_handler, uri_queue, prioritizer)

    # delay configuration copied from the settings
    self._crawl_delay = settings.FRONTIER_CRAWL_DELAY_FACTOR
    self._min_delay = settings.FRONTIER_MIN_DELAY

    # initialised to the moment of construction
    self._next_possible_crawl = time.time()
def __init__(self, settings, log_handler):
    """
    Set up the base frontier.

    The prioritizer class is resolved from `settings.PRIORITIZER_CLASS` and
    the URI queue is a `SQLiteSingleHostUriQueue` opened on
    `settings.FRONTIER_STATE_FILE`; both are handed to
    `AbstractBaseFrontier.__init__` along with `settings` and `log_handler`.
    """
    prioritizer_cls = import_class(settings.PRIORITIZER_CLASS)
    uri_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
    prioritizer = prioritizer_cls(settings)
    AbstractBaseFrontier.__init__(
        self, settings, log_handler, uri_queue, prioritizer)

    # delay configuration copied from the settings
    self._crawl_delay = settings.FRONTIER_CRAWL_DELAY_FACTOR
    self._min_delay = settings.FRONTIER_MIN_DELAY

    # initialised to the moment of construction
    self._next_possible_crawl = time.time()
def create_processing_function(settings, pipeline):
    """
    Build one callable that runs a message through every processor.

    Each entry of `pipeline` is resolved with `import_class` and the
    resulting class is instantiated with `settings`. The returned function
    passes its argument to the first processor, feeds each result into the
    next processor and returns whatever the last one produced. With an
    empty pipeline the message is returned unchanged.
    """
    stages = [import_class(name)(settings) for name in pipeline]

    def processing(data_message):
        """
        Run `data_message` through all stages in their configured order.
        """
        current = data_message
        for stage in stages:
            current = stage(current)
        return current

    return processing
def main(settings):
    """
    Entry point for master processes.

    Sets up logging over a ZeroMQ PUB socket, creates the management
    interface and the frontier, binds the PUSH and SUB sockets, builds the
    `ZmqMaster` and installs SIGINT/SIGTERM handlers that shut it down.
    The IO loop is then started and blocks until the master stops, after
    which the master, the management interface, the logging socket and the
    context are closed.
    """
    # identity of this process: host name plus pid
    identity = "master:%s:%s" % (socket.gethostname(), os.getpid())

    context = zmq.Context()
    loop = IOLoop.instance()

    # route the root logger through a ZeroMQ publishing handler
    log_socket = context.socket(zmq.PUB)
    log_socket.connect(settings.ZEROMQ_LOGGING)
    log_handler = PUBHandler(log_socket)
    log_handler.root_topic = "spyder.master"
    root_logger = logging.getLogger()
    root_logger.addHandler(log_handler)
    root_logger.setLevel(settings.LOG_LEVEL_MASTER)

    root_logger.info("process::Starting up the master")

    management = create_master_management(settings, context, loop)
    frontier = create_frontier(settings, log_handler)

    # outgoing socket
    push_socket = context.socket(zmq.PUSH)
    push_socket.setsockopt(zmq.HWM, settings.ZEROMQ_MASTER_PUSH_HWM)
    push_socket.bind(settings.ZEROMQ_MASTER_PUSH)

    # incoming socket, subscribed with the empty prefix
    sub_socket = context.socket(zmq.SUB)
    sub_socket.setsockopt(zmq.SUBSCRIBE, "")
    sub_socket.bind(settings.ZEROMQ_MASTER_SUB)

    master = ZmqMaster(
        settings,
        identity,
        sub_socket,
        push_socket,
        management,
        frontier,
        log_handler,
        settings.LOG_LEVEL_MASTER,
        loop,
    )

    def handle_shutdown_signal(_signum, _frame):
        """
        Signal handler: ask the master to shut down.
        """
        master.shutdown()
        # zmq 2.1 stops blocking calls, restart the ioloop
        loop.start()

    # react to kill signals
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, handle_shutdown_signal)

    # optional user supplied hook, only invoked when configured
    if settings.MASTER_CALLBACK:
        master_callback = import_class(settings.MASTER_CALLBACK)
        master_callback(settings, context, loop, frontier)

    management.start()
    master.start()

    # blocks here until the master stops
    try:
        loop.start()
    except ZMQError:
        root_logger.debug("Caught a ZMQError. Hopefully during shutdown")
        root_logger.debug(traceback.format_exc())

    master.close()
    management.close()

    root_logger.info("process::Master is down.")
    log_socket.close()

    context.term()
def create_frontier(settings, log_handler):
    """
    Instantiate the configured frontier.

    The class named by `settings.FRONTIER_CLASS` is resolved with
    `import_class` and constructed with `settings` and `log_handler`; the
    new instance is returned.
    """
    return import_class(settings.FRONTIER_CLASS)(settings, log_handler)