def __init__(self, manager):
    """Set up the Kafka connection, logging and message codecs.

    Reads the connection parameters from ``manager.settings``, creates the
    ``KafkaClient``, connects the consumer and the producer, and builds the
    encoder/decoder from the manager's request and response models.

    Args:
        manager: Object exposing ``settings``, ``request_model`` and
            ``response_model``.
    """
    self._manager = manager
    settings = manager.settings

    # Kafka connection parameters
    self._server = settings.get('KAFKA_LOCATION')
    self._topic_todo = settings.get('OUTGOING_TOPIC', "frontier-todo")
    self._topic_done = settings.get('INCOMING_TOPIC', "frontier-done")
    self._group = settings.get('FRONTIER_GROUP', "scrapy-crawler")
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
    self._partition_id = settings.get('SPIDER_PARTITION_ID')

    # Kafka setup
    self._conn = KafkaClient(self._server)
    self._prod = None
    self._cons = None

    # The "kafka" logger is shared process-wide, so unconditionally adding
    # a handler here would duplicate every log line once per constructed
    # instance. An exact type check is used on purpose: subclasses such as
    # FileHandler (or a NullHandler) must not suppress the stream handler.
    logger = getLogger("kafka")
    if not any(type(h) is StreamHandler for h in logger.handlers):
        logger.addHandler(StreamHandler())

    self._connect_consumer()
    self._connect_producer()

    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)
def __init__(self, manager):
    """Wire the backend to the configured message bus.

    Loads the message bus class named by the ``MESSAGE_BUS`` setting,
    builds the encoder/decoder from the manager's models, creates the
    spider log producer and a spider feed consumer bound to this spider's
    partition, and prepares the overused-hosts buffer.

    Args:
        manager: Object exposing ``settings``, ``request_model``,
            ``response_model`` and ``logger``.
    """
    self._manager = manager
    settings = self._manager.settings

    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)

    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)

    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()
    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)

    # Fall back to 5.0 when the setting is absent instead of failing with
    # an opaque ``float(None)`` TypeError.
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))

    self._buffer = OverusedBuffer(self._get_next_requests,
                                  manager.logger.manager.debug)
def __init__(self, manager):
    """Wire the backend to the configured message bus and validate its partition.

    Loads the message bus class named by the ``MESSAGE_BUS`` setting,
    builds the encoder/decoder from the manager's models, creates the
    spider log producer and a spider feed consumer bound to this spider's
    partition, and prepares the overused-hosts buffer.

    Args:
        manager: Object exposing ``settings``, ``request_model`` and
            ``response_model``.

    Raises:
        ValueError: If ``SPIDER_PARTITION_ID`` is negative, or is greater
            than or equal to ``SPIDER_FEED_PARTITIONS``.
    """
    settings = manager.settings

    messagebus = load_object(settings.get('MESSAGE_BUS'))
    self.mb = messagebus(settings)

    store_content = settings.get('STORE_CONTENT')
    self._encoder = Encoder(manager.request_model, send_body=store_content)
    self._decoder = Decoder(manager.request_model, manager.response_model)

    self.spider_log_producer = self.mb.spider_log().producer()
    spider_feed = self.mb.spider_feed()

    self.partition_id = int(settings.get('SPIDER_PARTITION_ID'))
    # Cast the partition count as well: a string-valued setting would
    # otherwise make the range comparison fail or give a wrong answer.
    partitions = int(settings.get('SPIDER_FEED_PARTITIONS'))
    if not 0 <= self.partition_id < partitions:
        raise ValueError(
            "Spider partition id must be at least 0 and less than "
            "SPIDER_FEED_PARTITIONS (%d), got %d."
            % (partitions, self.partition_id)
        )
    self.consumer = spider_feed.consumer(partition_id=self.partition_id)

    # Fall back to 5.0 when the setting is absent instead of failing with
    # an opaque ``float(None)`` TypeError.
    self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))

    self._logger = logging.getLogger("messagebus-backend")
    self._buffer = OverusedBuffer(self._get_next_requests, self._logger.debug)
    self._logger.info("Consuming from partition id %d", self.partition_id)