Beispiel #1
0
    def __init__(self, manager):
        """Configure the Kafka connection, logging and message codecs.

        Reads connection parameters from ``manager.settings``, creates the
        Kafka client, connects the consumer and producer, and builds the
        encoder/decoder from the manager's request/response models.

        Args:
            manager: object exposing ``settings`` (queried via ``get``),
                ``request_model`` and ``response_model``.
        """
        self._manager = manager
        settings = manager.settings

        # Kafka connection parameters
        self._server = settings.get('KAFKA_LOCATION')
        self._topic_todo = settings.get('OUTGOING_TOPIC', "frontier-todo")
        self._topic_done = settings.get('INCOMING_TOPIC', "frontier-done")
        self._group = settings.get('FRONTIER_GROUP', "scrapy-crawler")
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._partition_id = settings.get('SPIDER_PARTITION_ID')

        # Kafka setup
        self._conn = KafkaClient(self._server)
        # Placeholders; presumably populated by _connect_producer() and
        # _connect_consumer() below -- confirm against those methods.
        self._prod = None
        self._cons = None

        # The "kafka" logger is shared by the whole process, so attaching a
        # fresh handler on every instantiation would emit each record once
        # per instance created. Attach a stream handler only if a plain
        # StreamHandler is not already present. The exact-type check is
        # deliberate: handlers derived from StreamHandler (for example a
        # file handler) do not replace the stream output added here.
        logger = getLogger("kafka")
        if not any(type(h) is StreamHandler for h in logger.handlers):
            logger.addHandler(StreamHandler())

        self._connect_consumer()
        self._connect_producer()

        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)
Beispiel #2
0
 def __init__(self, manager):
     """Create the message bus endpoints, codecs and request buffer.

     The message bus class is loaded from the ``MESSAGE_BUS`` setting and
     instantiated with the settings object. A spider-log producer and a
     spider-feed consumer bound to ``SPIDER_PARTITION_ID`` are then
     obtained from it.

     Args:
         manager: object exposing ``settings``, ``request_model``,
             ``response_model`` and ``logger``.
     """
     self._manager = manager
     conf = self._manager.settings

     # Resolve the configured message bus class and instantiate it.
     bus_cls = load_object(conf.get('MESSAGE_BUS'))
     self.mb = bus_cls(conf)

     # Encoder/decoder are built from the manager's models; STORE_CONTENT
     # is forwarded to the encoder as ``send_body``.
     send_body = conf.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=send_body)
     self._decoder = Decoder(manager.request_model, manager.response_model)

     # Producer on the spider log stream.
     spider_log = self.mb.spider_log()
     self.spider_log_producer = spider_log.producer()

     # Consumer on the spider feed stream, bound to this partition.
     feed = self.mb.spider_feed()
     self.partition_id = int(conf.get('SPIDER_PARTITION_ID'))
     self.consumer = feed.consumer(partition_id=self.partition_id)

     self._get_timeout = float(conf.get('KAFKA_GET_TIMEOUT'))

     # The buffer wraps the request-fetching method and is handed the
     # manager's debug logging callable.
     fetch_requests = self._get_next_requests
     log_debug = manager.logger.manager.debug
     self._buffer = OverusedBuffer(fetch_requests, log_debug)
Beispiel #3
0
 def __init__(self, manager):
     """Create the message bus endpoints, codecs and request buffer.

     The message bus class is loaded from the ``MESSAGE_BUS`` setting and
     instantiated with the settings object. A spider-log producer and a
     spider-feed consumer bound to ``SPIDER_PARTITION_ID`` are then
     obtained from it.

     Args:
         manager: object exposing ``settings``, ``request_model`` and
             ``response_model``.

     Raises:
         ValueError: if ``SPIDER_PARTITION_ID`` is outside the range
             ``0 .. SPIDER_FEED_PARTITIONS - 1``.
     """
     settings = manager.settings
     messagebus = load_object(settings.get('MESSAGE_BUS'))
     self.mb = messagebus(settings)
     store_content = settings.get('STORE_CONTENT')
     self._encoder = Encoder(manager.request_model, send_body=store_content)
     self._decoder = Decoder(manager.request_model, manager.response_model)
     self.spider_log_producer = self.mb.spider_log().producer()
     spider_feed = self.mb.spider_feed()
     self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
     # Valid ids are 0 .. SPIDER_FEED_PARTITIONS - 1. The partition count is
     # coerced to int so that a string-valued setting is compared
     # numerically instead of failing (or comparing by type) against the
     # int partition id. The chained comparison keeps the original
     # short-circuit: the count is only read when the id is non-negative.
     if not 0 <= self.partition_id < int(
             settings.get('SPIDER_FEED_PARTITIONS')):
         raise ValueError(
             "Spider partition id cannot be less than 0 or greater than or "
             "equal to SPIDER_FEED_PARTITIONS."
         )
     self.consumer = spider_feed.consumer(partition_id=self.partition_id)
     self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT'))
     self._logger = logging.getLogger("messagebus-backend")
     # The buffer wraps the request-fetching method and is handed this
     # backend's debug logging callable.
     self._buffer = OverusedBuffer(self._get_next_requests,
                                   self._logger.debug)
     self._logger.info("Consuming from partition id %d", self.partition_id)