def test_hashed_partitioner(self):
    partitions = self.client.get_partition_ids_for_topic(self.topic)
    start_offsets = [self.current_offset(self.topic, p) for p in partitions]

    producer = KeyedProducer(self.client, partitioner=HashedPartitioner)
    resp1 = producer.send_messages(self.topic, self.key("1"), self.msg("one"))
    resp2 = producer.send_messages(self.topic, self.key("2"), self.msg("two"))
    resp3 = producer.send_messages(self.topic, self.key("3"), self.msg("three"))
    resp4 = producer.send_messages(self.topic, self.key("3"), self.msg("four"))
    resp5 = producer.send_messages(self.topic, self.key("4"), self.msg("five"))

    offsets = {partitions[0]: start_offsets[0], partitions[1]: start_offsets[1]}
    messages = {partitions[0]: [], partitions[1]: []}

    keys = [self.key(k) for k in ["1", "2", "3", "3", "4"]]
    resps = [resp1, resp2, resp3, resp4, resp5]
    msgs = [self.msg(m) for m in ["one", "two", "three", "four", "five"]]

    for key, resp, msg in zip(keys, resps, msgs):
        k = hash(key) % 2
        partition = partitions[k]

        offset = offsets[partition]
        self.assert_produce_response(resp, offset)

        offsets[partition] += 1
        messages[partition].append(msg)

    self.assert_fetch_offset(partitions[0], start_offsets[0], messages[partitions[0]])
    self.assert_fetch_offset(partitions[1], start_offsets[1], messages[partitions[1]])

    producer.stop()
def test_hashed_partitioner(self):
    start_offset0 = self.current_offset(self.topic, 0)
    start_offset1 = self.current_offset(self.topic, 1)

    producer = KeyedProducer(self.client, partitioner=HashedPartitioner)
    resp1 = producer.send(self.topic, self.key("1"), self.msg("one"))
    resp2 = producer.send(self.topic, self.key("2"), self.msg("two"))
    resp3 = producer.send(self.topic, self.key("3"), self.msg("three"))
    resp4 = producer.send(self.topic, self.key("3"), self.msg("four"))
    resp5 = producer.send(self.topic, self.key("4"), self.msg("five"))

    offsets = {0: start_offset0, 1: start_offset1}
    messages = {0: [], 1: []}

    keys = [self.key(k) for k in ["1", "2", "3", "3", "4"]]
    resps = [resp1, resp2, resp3, resp4, resp5]
    msgs = [self.msg(m) for m in ["one", "two", "three", "four", "five"]]

    for key, resp, msg in zip(keys, resps, msgs):
        k = hash(key) % 2

        offset = offsets[k]
        self.assert_produce_response(resp, offset)

        offsets[k] += 1
        messages[k].append(msg)

    self.assert_fetch_offset(0, start_offset0, messages[0])
    self.assert_fetch_offset(1, start_offset1, messages[1])

    producer.stop()
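Both hashed-partitioner tests above mirror the partitioner's assignment rule: the legacy kafka-python HashedPartitioner picks partitions[hash(key) % len(partitions)]. A minimal standalone sketch of that rule follows; the keys and partition list are hypothetical, and later library versions use a different (murmur2-based) hash:

# Sketch of hash-based partition assignment, mirroring the hash(key) % 2
# step in the tests above. Keys and partitions are made up for illustration.
partitions = [0, 1]

def pick_partition(key, partitions):
    # Messages with equal keys always map to the same partition; note that
    # Python 3 randomizes str hashes per process, but the mapping stays
    # stable within a run, which is all the tests rely on.
    return partitions[hash(key) % len(partitions)]

for key in ["1", "2", "3", "3", "4"]:
    print(key, "->", pick_partition(key, partitions))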
def test_keyedproducer_null_payload(self):
    partitions = self.client.get_partition_ids_for_topic(self.topic)
    start_offsets = [self.current_offset(self.topic, p) for p in partitions]

    producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner)

    resp = producer.send_messages(self.topic, self.key("key1"), self.msg("one"))
    self.assert_produce_response(resp, start_offsets[0])
    resp = producer.send_messages(self.topic, self.key("key2"), None)
    self.assert_produce_response(resp, start_offsets[1])
    resp = producer.send_messages(self.topic, self.key("key3"), None)
    self.assert_produce_response(resp, start_offsets[0] + 1)
    resp = producer.send_messages(self.topic, self.key("key4"), self.msg("four"))
    self.assert_produce_response(resp, start_offsets[1] + 1)

    self.assert_fetch_offset(partitions[0], start_offsets[0], [self.msg("one"), None])
    self.assert_fetch_offset(partitions[1], start_offsets[1], [None, self.msg("four")])

    producer.stop()
def test_round_robin_partitioner(self):
    partitions = self.client.get_partition_ids_for_topic(self.topic)
    start_offsets = [self.current_offset(self.topic, p) for p in partitions]

    producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner)
    resp1 = producer.send_messages(self.topic, self.key("key1"), self.msg("one"))
    resp2 = producer.send_messages(self.topic, self.key("key2"), self.msg("two"))
    resp3 = producer.send_messages(self.topic, self.key("key3"), self.msg("three"))
    resp4 = producer.send_messages(self.topic, self.key("key4"), self.msg("four"))

    self.assert_produce_response(resp1, start_offsets[0] + 0)
    self.assert_produce_response(resp2, start_offsets[1] + 0)
    self.assert_produce_response(resp3, start_offsets[0] + 1)
    self.assert_produce_response(resp4, start_offsets[1] + 1)

    self.assert_fetch_offset(
        partitions[0], start_offsets[0], [self.msg("one"), self.msg("three")])
    self.assert_fetch_offset(
        partitions[1], start_offsets[1], [self.msg("two"), self.msg("four")])

    producer.stop()
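The interleaved offset assertions above rely on RoundRobinPartitioner cycling through the partition list one message at a time, regardless of key. A minimal sketch of that behavior, using itertools.cycle as a stand-in for the library's implementation:

# Illustration only: round-robin assignment over two partitions.
from itertools import cycle

partition_cycle = cycle([0, 1])
for msg in ["one", "two", "three", "four"]:
    # "one"/"three" land on partition 0 and "two"/"four" on partition 1,
    # matching the assert_fetch_offset expectations in the test above.
    print(msg, "->", next(partition_cycle))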
def test_hashed_partitioner(self):
    start_offset0 = self.current_offset(self.topic, 0)
    start_offset1 = self.current_offset(self.topic, 1)

    producer = KeyedProducer(self.client, partitioner=HashedPartitioner)
    resp1 = producer.send(self.topic, 1, self.msg("one"))
    resp2 = producer.send(self.topic, 2, self.msg("two"))
    resp3 = producer.send(self.topic, 3, self.msg("three"))
    resp4 = producer.send(self.topic, 3, self.msg("four"))
    resp5 = producer.send(self.topic, 4, self.msg("five"))

    self.assert_produce_response(resp1, start_offset1 + 0)
    self.assert_produce_response(resp2, start_offset0 + 0)
    self.assert_produce_response(resp3, start_offset1 + 1)
    self.assert_produce_response(resp4, start_offset1 + 2)
    self.assert_produce_response(resp5, start_offset0 + 1)

    self.assert_fetch_offset(
        0, start_offset0, [self.msg("two"), self.msg("five")])
    self.assert_fetch_offset(
        1, start_offset1, [self.msg("one"), self.msg("three"), self.msg("four")])

    producer.stop()
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, host, port, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(host, port)
        self.key = key
        if key is None:
            self.producer = SimpleProducer(self.kafka_client, topic)
        else:
            self.producer = KeyedProducer(self.kafka_client, topic)

    def emit(self, record):
        # Drop kafka's own log records to avoid infinite recursion.
        if record.name == 'kafka':
            return
        try:
            # Use the handler's configured formatting.
            msg = self.format(record)
            # Produce the message, keyed if a key was configured.
            if self.key is None:
                self.producer.send_messages(msg)
            else:
                self.producer.send(self.key, msg)
        except Exception:
            import traceback
            ei = sys.exc_info()
            traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr)
            del ei

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
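A short usage sketch for the handler above, wiring it into the standard logging module; the host, port, topic, key, and logger name are hypothetical:

# Hypothetical wiring of KafkaLoggingHandler into the logging module.
import logging

logger = logging.getLogger("my-app")
logger.setLevel(logging.INFO)

# With key set, records go through the KeyedProducer branch of __init__.
handler = KafkaLoggingHandler("localhost", 9092, "app-logs", key="web-1")
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(handler)

logger.info("this record is produced to the app-logs topic")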
def sendMsg(topic, lines):
    if lines:
        brokers = '10.117.181.44:9092,10.117.108.143:9092,10.117.21.79:9092'
        kafka = KafkaClient(brokers)
        producer = KeyedProducer(kafka)
        for line in lines:
            ran = "_" + str(random.randint(0, 10))
            producer.send_messages(topic, topic + ran, line)
        producer.stop()
def sendMsg(topic, lines):
    if lines:
        brokers = 'cdh-slave0:9092,cdh-slave1:9092,cdh-slave2:9092'
        kafka = KafkaClient(brokers)
        producer = KeyedProducer(kafka)
        for line in lines:
            ran = "_" + str(random.randint(0, 10))
            producer.send_messages(topic, topic + ran, line)
        producer.stop()
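Both sendMsg variants spread a batch of lines across partitions by appending a random suffix to the topic name and using the result as the message key, which the default hashed partitioner then distributes. A hypothetical invocation, with made-up topic and payloads:

# Hypothetical call to sendMsg. Note each call creates and tears down its
# own client and producer, so it suits batch contexts (e.g. one call per
# micro-batch) rather than per-message use.
sendMsg("access_logs", [
    "GET /index.html 200",
    "GET /favicon.ico 404",
])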
def test_async_keyed_producer(self):
    start_offset0 = self.current_offset(self.topic, 0)

    producer = KeyedProducer(self.client,
                             partitioner=RoundRobinPartitioner,
                             async=True)

    resp = producer.send(self.topic, self.key("key1"), self.msg("one"))
    self.assertEqual(len(resp), 0)

    self.assert_fetch_offset(0, start_offset0, [self.msg("one")])

    producer.stop()
class KeyedProducer(BaseStreamProducer):

    def __init__(self, connection, topic_done, partitioner_cls, codec):
        self._prod = None
        self._conn = connection
        self._topic_done = topic_done
        self._partitioner_cls = partitioner_cls
        self._codec = codec

    def _connect_producer(self):
        if self._prod is None:
            try:
                self._prod = KafkaKeyedProducer(
                    self._conn,
                    partitioner=self._partitioner_cls,
                    codec=self._codec)
            except BrokerResponseError:
                self._prod = None
                logger.warning("Could not connect producer to Kafka server")
                return False
        return True

    def send(self, key, *messages):
        success = False
        max_tries = 5
        if self._connect_producer():
            n_tries = 0
            while not success and n_tries < max_tries:
                try:
                    self._prod.send_messages(self._topic_done, key, *messages)
                    success = True
                except MessageSizeTooLargeError as e:
                    logger.error(str(e))
                    break
                except BrokerResponseError:
                    n_tries += 1
                    logger.warning(
                        "Could not send message. Try {0}/{1}".format(
                            n_tries, max_tries))
                    sleep(1.0)
        return success

    def flush(self):
        if self._prod is not None:
            self._prod.stop()

    def get_offset(self, partition_id):
        # Kafka has its own offset management.
        raise KeyError
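A brief usage sketch of this wrapper, borrowing the partitioner and codec names that appear in the KafkaBackend snippet later in this section; all concrete values are illustrative:

# Hypothetical usage of the KeyedProducer wrapper above. The broker
# address, topic, partitioner, and codec are illustrative stand-ins.
client = KafkaClient("localhost:9092")
producer = KeyedProducer(client, "frontier-done",
                         partitioner_cls=FingerprintPartitioner,
                         codec=CODEC_SNAPPY)

# send() retries up to 5 times on BrokerResponseError and returns a bool.
if not producer.send(b"some-fingerprint", b"payload-1", b"payload-2"):
    print("send failed after retries")

producer.flush()  # stops the underlying Kafka producer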
def test_async_keyed_producer(self):
    partition = self.client.get_partition_ids_for_topic(self.topic)[0]
    start_offset = self.current_offset(self.topic, partition)

    producer = KeyedProducer(self.client,
                             partitioner=RoundRobinPartitioner,
                             async=True)

    resp = producer.send_messages(self.topic, self.key("key1"), self.msg("one"))
    self.assertEqual(len(resp), 0)

    # Wait for the server to report a new highwatermark.
    while self.current_offset(self.topic, partition) == start_offset:
        time.sleep(0.1)

    self.assert_fetch_offset(partition, start_offset, [self.msg("one")])

    producer.stop()
def test_round_robin_partitioner(self):
    start_offset0 = self.current_offset(self.topic, 0)
    start_offset1 = self.current_offset(self.topic, 1)

    producer = KeyedProducer(self.client, partitioner=RoundRobinPartitioner)
    resp1 = producer.send(self.topic, self.key("key1"), self.msg("one"))
    resp2 = producer.send(self.topic, self.key("key2"), self.msg("two"))
    resp3 = producer.send(self.topic, self.key("key3"), self.msg("three"))
    resp4 = producer.send(self.topic, self.key("key4"), self.msg("four"))

    self.assert_produce_response(resp1, start_offset0 + 0)
    self.assert_produce_response(resp2, start_offset1 + 0)
    self.assert_produce_response(resp3, start_offset0 + 1)
    self.assert_produce_response(resp4, start_offset1 + 1)

    self.assert_fetch_offset(0, start_offset0, [self.msg("one"), self.msg("three")])
    self.assert_fetch_offset(1, start_offset1, [self.msg("two"), self.msg("four")])

    producer.stop()
def init():
    oauth_client = Oauth(config.get('oauth', 'consumer_key'),
                         config.get('oauth', 'consumer_secret'),
                         config.get('oauth', 'request_token_url'),
                         config.get('oauth', 'access_token_url'),
                         config.get('oauth', 'authorize_url'),
                         version=config.get('oauth', 'version'))

    request = Request(url=config.get('twitter', 'streaming_filter_url'),
                      method="POST",
                      is_streaming=True,
                      headers={'Accept-Encoding': 'deflate, gzip '},
                      payload={'locations': '-118.39,30.41,-59.61,49.46'},
                      token=token)

    max_stream = int(config.get('twitter', 'max_stream_responses'))
    topic = config.get('kafka', 'topic')
    max_skip_invalid_responses = config.getint('twitter', 'max_skip_invalid_response')
    skip_invalid_responses = config.getboolean('twitter', 'skip_invalid')

    producer = KeyedProducer(kafka_client, async=True)
    twitter = TwitterStream(oauth_client, json)
    tweets = twitter.get_tweets(request)

    # Starts here.
    try:
        if max_stream < 0:
            send_unlimited_messages(tweets, producer, topic)
        else:
            send_limited_messages(max_stream, tweets, producer, topic,
                                  skip_invalid_responses,
                                  max_skip_invalid_responses)
    except Exception as e:
        print(e)
    finally:
        producer.stop()
        kafka_client.close()
class KafkaBackend(Backend):

    def __init__(self, manager):
        self._manager = manager
        settings = manager.settings

        # Kafka connection parameters
        self._server = settings.get('KAFKA_LOCATION')
        self._topic_todo = settings.get('OUTGOING_TOPIC', "frontier-todo")
        self._topic_done = settings.get('INCOMING_TOPIC', "frontier-done")
        self._group = settings.get('FRONTIER_GROUP', "scrapy-crawler")
        self._get_timeout = float(settings.get('KAFKA_GET_TIMEOUT', 5.0))
        self._partition_id = settings.get('SPIDER_PARTITION_ID')

        # Kafka setup
        self._conn = KafkaClient(self._server)
        self._prod = None
        self._cons = None

        logger = getLogger("kafka")
        handler = StreamHandler()
        logger.addHandler(handler)

        self._connect_consumer()
        self._connect_producer()

        store_content = settings.get('STORE_CONTENT')
        self._encoder = Encoder(manager.request_model, send_body=store_content)
        self._decoder = Decoder(manager.request_model, manager.response_model)

    def _connect_producer(self):
        """If the producer is not connected, try to connect it now.

        :returns: bool -- True if the producer is connected
        """
        if self._prod is None:
            try:
                self._prod = KeyedProducer(self._conn,
                                           partitioner=FingerprintPartitioner,
                                           codec=CODEC_SNAPPY)
            except BrokerResponseError:
                self._prod = None
                if self._manager is not None:
                    self._manager.logger.backend.warning(
                        "Could not connect producer to Kafka server")
                return False
        return True

    def _connect_consumer(self):
        """If the consumer is not connected, try to connect it now.

        :returns: bool -- True if the consumer is connected
        """
        if self._cons is None:
            try:
                self._cons = SimpleConsumer(self._conn,
                                            self._group,
                                            self._topic_todo,
                                            partitions=[self._partition_id],
                                            buffer_size=131072,
                                            max_buffer_size=1048576)
            except BrokerResponseError:
                self._cons = None
                if self._manager is not None:
                    self._manager.logger.backend.warning(
                        "Could not connect consumer to Kafka server")
                return False
        return True

    @classmethod
    def from_manager(cls, manager):
        return cls(manager)

    def frontier_start(self):
        if self._connect_consumer():
            self._manager.logger.backend.info(
                "Successfully connected consumer to " + self._topic_todo)
        else:
            self._manager.logger.backend.warning(
                "Could not connect consumer to {0}. I will try later.".format(
                    self._topic_todo))

    def frontier_stop(self):
        # Flush everything if a batch is incomplete.
        self._prod.stop()

    def _send_message(self, encoded_message, key, fail_wait_time=1.0, max_tries=5):
        success = False
        if self._connect_producer():
            n_tries = 0
            while not success and n_tries < max_tries:
                try:
                    self._prod.send_messages(self._topic_done, key, encoded_message)
                    success = True
                except MessageSizeTooLargeError as e:
                    self._manager.logger.backend.error(str(e))
                    self._manager.logger.backend.debug(
                        "Message: %s" % encoded_message)
                    break
                except BrokerResponseError:
                    n_tries += 1
                    if self._manager is not None:
                        self._manager.logger.backend.warning(
                            "Could not send message. Try {0}/{1}".format(
                                n_tries, max_tries))
                    time.sleep(fail_wait_time)
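For reference, a hedged sketch of the settings this backend reads, expressed as a plain dict; the keys match what __init__ looks up, while the values are illustrative defaults:

# Illustrative settings for KafkaBackend; the values here are hypothetical.
EXAMPLE_SETTINGS = {
    'KAFKA_LOCATION': 'localhost:9092',
    'OUTGOING_TOPIC': 'frontier-todo',   # topic this spider consumes
    'INCOMING_TOPIC': 'frontier-done',   # topic produced to, keyed messages
    'FRONTIER_GROUP': 'scrapy-crawler',  # consumer group name
    'KAFKA_GET_TIMEOUT': 5.0,
    'SPIDER_PARTITION_ID': 0,            # partition assigned to this spider
    'STORE_CONTENT': False,              # whether Encoder sends response bodies
}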