コード例 #1
0
    def __init__(self, worker, settings, stop_event, no_scoring=False, **kwargs):
        """Attach this component to the scoring log and the backend queue.

        :param worker: owning worker; its ``backend`` and ``message_bus``
            attributes are read here.
        :param settings: settings object, queried for
            ``SCORING_LOG_CONSUMER_BATCH_SIZE``.
        :param stop_event: forwarded to the base initializer.
        :param no_scoring: when true, the component refuses to start.
        :param kwargs: forwarded unchanged to the base initializer.
        :raises NotConfigured: if ``no_scoring`` is set, or if the worker
            backend is not a ``DistributedBackend``.
        """
        super(ScoringConsumer, self).__init__(worker, settings, stop_event, **kwargs)

        # Guards run only after the base initializer has completed.
        if no_scoring:
            raise NotConfigured('ScoringConsumer is disabled with --no-scoring')
        backend_is_distributed = isinstance(worker.backend, DistributedBackend)
        if not backend_is_distributed:
            raise NotConfigured('Strategy is disabled for non-distributed backend')

        # The scoring log object is only needed to obtain its consumer.
        self.scoring_log_consumer = worker.message_bus.scoring_log().consumer()
        self.scoring_log_consumer_batch_size = settings.get(
            'SCORING_LOG_CONSUMER_BATCH_SIZE')
        self.backend_queue = worker.backend.queue
コード例 #2
0
    def __init__(self, worker, settings, stop_event, no_incoming=False,
                 **kwargs):
        """Attach this component to the spider log and the spider feed.

        :param worker: owning worker; its ``message_bus`` and ``backend``
            attributes are read here.
        :param settings: settings object, queried for
            ``SPIDER_LOG_CONSUMER_BATCH_SIZE`` and ``MAX_NEXT_REQUESTS``.
        :param stop_event: forwarded to the base initializer.
        :param no_incoming: when true, the component refuses to start.
        :param kwargs: forwarded unchanged to the base initializer.
        :raises NotConfigured: if ``no_incoming`` is set.
        """
        super(IncomingConsumer, self).__init__(
            worker, settings, stop_event, **kwargs)
        if no_incoming:
            raise NotConfigured(
                'IncomingConsumer is disabled with --no-incoming')

        # The spider log object is only needed to obtain its consumer.
        self.spider_log_consumer = worker.message_bus.spider_log().consumer(
            partition_id=None, type=b'db')
        self.spider_log_consumer_batch_size = settings.get(
            'SPIDER_LOG_CONSUMER_BATCH_SIZE')

        # The spider feed is needed solely for 'offset' messages: the lag is
        # checked to decide whether a feed producer is marked busy or ready.
        # XXX not implemented for kafka message bus
        self.spider_feed = worker.message_bus.spider_feed()
        self.spider_feed_producer = self.spider_feed.producer()

        self.backend = worker.backend
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
コード例 #3
0
    def __init__(self, worker, settings, stop_event, no_batches=False,
                 partitions=None, **kwargs):
        """Prepare the batch generator state and its spider feed producer.

        :param worker: owning worker; its ``backend`` and ``message_bus``
            attributes are read here.
        :param settings: settings object, queried for ``NEW_BATCH_DELAY``,
            ``QUEUE_HOSTNAME_PARTITIONING``, ``DOMAINS_BLACKLIST``,
            ``MAX_NEXT_REQUESTS`` and ``DOMAIN_STATS_LOG_INTERVAL``.
        :param stop_event: forwarded to the base initializer.
        :param no_batches: when true, the component refuses to start.
        :param partitions: iterable of partition ids; one per-domain counter
            map is created for each of them.
        :param kwargs: forwarded unchanged to the base initializer.
        :raises NotConfigured: if ``no_batches`` is set.
        """
        super(BatchGenerator, self).__init__(
            worker, settings, stop_event, **kwargs)
        if no_batches:
            raise NotConfigured('BatchGenerator is disabled with --no-batches')

        self.run_backoff = settings.get('NEW_BATCH_DELAY')
        self.backend = worker.backend
        self.spider_feed = worker.message_bus.spider_feed()
        self.spider_feed_producer = self.spider_feed.producer()

        # Key by fingerprint unless hostname partitioning is switched on.
        key_function = self.get_fingerprint
        if settings.get('QUEUE_HOSTNAME_PARTITIONING'):
            key_function = self.get_hostname
        self.get_key_function = key_function

        self.domains_blacklist = settings.get('DOMAINS_BLACKLIST')
        self.max_next_requests = settings.MAX_NEXT_REQUESTS
        self.partitions = partitions
        # Event used to switch batch generation off and on via RPC.
        self.disabled_event = threading.Event()

        # Domain statistics logging: one counter map per partition id.
        # NOTE(review): ``partitions`` defaults to None yet is iterated here,
        # so the caller has to supply an iterable.
        self.domain_stats = {
            partition_id: defaultdict(int)
            for partition_id in self.partitions
        }
        self.domain_stats_interval = settings.get('DOMAIN_STATS_LOG_INTERVAL')
        # First rotation deadline: one interval from now.
        self.rotate_time = time() + self.domain_stats_interval
コード例 #4
0
ファイル: db.py プロジェクト: widy28/frontera
 def _load_components(self, worker, settings, **kwargs):
     """Instantiate every class in ``ALL_COMPONENTS`` that agrees to start.

     Each class is called with ``worker``, ``settings``, this object's
     ``stop_event`` and the extra keyword arguments. A class that raises
     ``NotConfigured`` is skipped, and the reason it gave is logged.

     :param worker: passed as the first argument to every component class.
     :param settings: passed as the second argument to every component class.
     :param kwargs: forwarded unchanged to every component class.
     :returns: dict mapping each component class to its instance.
     :raises NotConfigured: if no component could be instantiated.
     """
     # Keyed by component class, valued by the constructed instance.
     components = {}
     for cls in ALL_COMPONENTS:
         try:
             component = cls(worker, settings, stop_event=self.stop_event, **kwargs)
         except NotConfigured as exc:
             # Log the reason too: a component can be disabled for more than
             # one cause and the class name alone does not say which.
             logger.info("Component %s is disabled: %s", cls.NAME, exc)
         else:
             components[cls] = component
     if not components:
         raise NotConfigured("No components to run, please check your input args")
     return components