def __init__(self, worker, settings, stop_event, no_scoring=False, **kwargs):
    """Consume the scoring log and feed score updates into the backend queue.

    :param worker: owning worker; must use a :class:`DistributedBackend`
    :param settings: frontier settings object
    :param stop_event: event used to signal shutdown
    :param no_scoring: when True the component is disabled
    :raises NotConfigured: if scoring is disabled or the backend is not distributed
    """
    super(ScoringConsumer, self).__init__(worker, settings, stop_event, **kwargs)
    # guard clauses: bail out before touching the message bus
    if no_scoring:
        raise NotConfigured('ScoringConsumer is disabled with --no-scoring')
    if not isinstance(worker.backend, DistributedBackend):
        raise NotConfigured('Strategy is disabled for non-distributed backend')
    self.scoring_log_consumer = worker.message_bus.scoring_log().consumer()
    self.scoring_log_consumer_batch_size = settings.get(
        'SCORING_LOG_CONSUMER_BATCH_SIZE')
    self.backend_queue = worker.backend.queue
def __init__(self, worker, settings, stop_event, no_incoming=False, **kwargs):
    """Consume the spider log (db partition) and push into the backend.

    :param worker: owning worker; provides message bus and backend
    :param settings: frontier settings object
    :param stop_event: event used to signal shutdown
    :param no_incoming: when True the component is disabled
    :raises NotConfigured: if incoming consumption is disabled
    """
    super(IncomingConsumer, self).__init__(worker, settings, stop_event, **kwargs)
    if no_incoming:
        raise NotConfigured('IncomingConsumer is disabled with --no-incoming')
    bus = worker.message_bus
    self.spider_log_consumer = bus.spider_log().consumer(partition_id=None,
                                                         type=b'db')
    self.spider_log_consumer_batch_size = settings.get(
        'SPIDER_LOG_CONSUMER_BATCH_SIZE')
    # spider-feed is required only to handle 'offset' messages:
    # check lag to decide if mark feed producer as busy or ready
    # XXX not implemented for kafka message bus
    self.spider_feed = bus.spider_feed()
    self.spider_feed_producer = self.spider_feed.producer()
    self.backend = worker.backend
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
def __init__(self, worker, settings, stop_event, no_batches=False,
             partitions=None, **kwargs):
    """Generate new batches of requests for the spider feed.

    :param worker: owning worker; provides backend and message bus
    :param settings: frontier settings object
    :param stop_event: event used to signal shutdown
    :param no_batches: when True the component is disabled
    :param partitions: iterable of spider-feed partition ids served by
        this worker (assumed non-None by callers — TODO confirm)
    :raises NotConfigured: if batch generation is disabled
    """
    super(BatchGenerator, self).__init__(worker, settings, stop_event, **kwargs)
    if no_batches:
        raise NotConfigured('BatchGenerator is disabled with --no-batches')
    self.run_backoff = settings.get('NEW_BATCH_DELAY')
    self.backend = worker.backend
    self.spider_feed = worker.message_bus.spider_feed()
    self.spider_feed_producer = self.spider_feed.producer()
    # partition key is hostname-based when hostname partitioning is on,
    # otherwise the request fingerprint is used
    self.get_key_function = self.get_fingerprint
    if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
        self.get_key_function = self.get_hostname
    self.domains_blacklist = settings.get('DOMAINS_BLACKLIST')
    self.max_next_requests = settings.MAX_NEXT_REQUESTS
    self.partitions = partitions
    # create an event to disable/enable batches generation via RPC
    self.disabled_event = threading.Event()
    # domain statistics logging: per-partition counters, rotated every
    # DOMAIN_STATS_LOG_INTERVAL seconds (dict comprehension replaces the
    # old dict([...]) over a list comprehension — same result, idiomatic)
    self.domain_stats = {partition_id: defaultdict(int)
                         for partition_id in self.partitions}
    self.domain_stats_interval = settings.get('DOMAIN_STATS_LOG_INTERVAL')
    self.rotate_time = time() + self.domain_stats_interval
def _load_components(self, worker, settings, **kwargs):
    """Instantiate every class in ALL_COMPONENTS, skipping disabled ones.

    A component signals that it is disabled by raising ``NotConfigured``
    from its constructor; such components are logged and omitted.

    :returns: dict mapping component class -> instance
    :raises NotConfigured: if every component turned out to be disabled
    """
    # each component is stored as (cls, instance) pair
    components = {}
    for cls in ALL_COMPONENTS:
        try:
            component = cls(worker, settings, stop_event=self.stop_event,
                            **kwargs)
        except NotConfigured:
            # lazy %-style args: message is only rendered if INFO is emitted
            logger.info("Component %s is disabled", cls.NAME)
        else:
            components[cls] = component
    if not components:
        raise NotConfigured("No components to run, please check your input args")
    return components