def __init__(
    self,
    spark,
    df_schema,
    key_deserializer,
    value_deserializer,
    host,
    topic,
    port=9092,
):
    """Initialize the context manager.

    `key_deserializer` and `value_deserializer` are callables that take
    bytes as input and return Python structures as output.

    Args:
        spark (SparklySession): currently active SparklySession
        df_schema (pyspark.sql.types.StructType): schema of the dataframe to be generated
        key_deserializer (function): function used to deserialize the key
        value_deserializer (function): function used to deserialize the value
        host (basestring): host or IP address of the Kafka server to connect to
        topic (basestring): Kafka topic to monitor
        port (int): port number of the Kafka server to connect to
    """
    self.spark = spark
    self.topic = topic
    self.df_schema = df_schema
    self.key_deser, self.val_deser = key_deserializer, value_deserializer
    self.host, self.port = host, port
    self._df = None
    self.count = 0

    # Fail fast if the topic cannot be created or reached.
    kafka_client = SimpleClient(host)
    kafka_client.ensure_topic_exists(topic)
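# A usage sketch for the fixture above; the signature matches sparkly's
# KafkaWatcher context manager, but the session, schema, and the job
# exercised below are assumptions for illustration only.
import json

def deserializer(raw_bytes):
    # Deserializers receive raw bytes and must return Python structures.
    return json.loads(raw_bytes.decode('utf-8'))

with KafkaWatcher(spark, df_schema, deserializer, deserializer,
                  'kafka.docker', 'test.topic') as watcher:
    run_job_under_test()  # hypothetical code that publishes to the topic

assert watcher.count > 0  # messages observed while inside the block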
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
            self.topic = topic

        if self.create_client:
            self.client = SimpleClient('%s:%d' % (self.server.host, self.server.port))
            self.client_async = KafkaClient(
                bootstrap_servers='%s:%d' % (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request(
                [OffsetRequestPayload(topic, partition, -1, 1)])
        except Exception:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))
        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
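# A minimal subclass sketch showing how the harness above is typically
# exercised. SimpleProducer is kafka-python's legacy producer API; the test
# body itself is an assumption, not part of the original suite.
from kafka import SimpleProducer

class TestRoundTrip(KafkaIntegrationTestCase):
    def test_produce_one(self):
        # setUp already called ensure_topic_exists for self.topic
        producer = SimpleProducer(self.client)
        producer.send_messages(self.topic, self.msg('hello'))
        self.assertEqual(self.current_offset(self.topic, 0), 1)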
def send_to_kafka(message):
    producer = get_producer()
    try:
        producer.send(settings.KAFKA_TOPIC, message)
    except Exception:
        # The topic may not exist yet: create it, then retry the send once.
        client = SimpleClient(hosts=settings.KAFKA_SERVERS)
        client.ensure_topic_exists(settings.KAFKA_TOPIC)
        client.close()
        producer.send(settings.KAFKA_TOPIC, message)
    producer.close(10)
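# A plausible get_producer() for the helper above; the real project helper
# is not shown, so the JSON value serializer here is an assumption.
import json
from kafka import KafkaProducer

def get_producer():
    return KafkaProducer(
        bootstrap_servers=settings.KAFKA_SERVERS,
        value_serializer=lambda m: json.dumps(m).encode('utf-8'),
    )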
def test_ensure_topic_exists(self, decode_metadata_response, conn):
    mock_conn(conn)

    brokers = [
        BrokerMetadata(0, 'broker_1', 4567, None),
        BrokerMetadata(1, 'broker_2', 5678, None),
    ]
    resp0_brokers = list(map(itemgetter(0, 1, 2), brokers))

    topics = [
        (NO_LEADER, 'topic_still_creating', []),
        (UNKNOWN_TOPIC_OR_PARTITION, 'topic_doesnt_exist', []),
        (NO_ERROR, 'topic_noleaders', [
            (NO_LEADER, 0, -1, [], []),
            (NO_LEADER, 1, -1, [], []),
        ]),
    ]
    decode_metadata_response.return_value = MetadataResponse[0](
        resp0_brokers, topics)

    client = SimpleClient(hosts=['broker_1:4567'])

    with self.assertRaises(UnknownTopicOrPartitionError):
        client.ensure_topic_exists('topic_doesnt_exist', timeout=1)

    with self.assertRaises(KafkaTimeoutError):
        client.ensure_topic_exists('topic_still_creating', timeout=1)

    # This should not raise
    client.ensure_topic_exists('topic_noleaders', timeout=1)
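# For reference, the behavior this test pins down: ensure_topic_exists polls
# cluster metadata until the topic is known (a leaderless topic still counts)
# and raises KafkaTimeoutError once the timeout elapses. A simplified sketch
# of those semantics, not the library's exact code:
import time
from kafka.errors import KafkaTimeoutError

def ensure_topic_exists_sketch(client, topic, timeout=30):
    start = time.time()
    while not client.has_metadata_for_topic(topic):
        if time.time() > start + timeout:
            raise KafkaTimeoutError('Unable to create topic %s' % topic)
        # Re-fetch metadata; broker-side topic auto-creation kicks in if enabled.
        client.load_metadata_for_topics(topic, ignore_leadernotavailable=True)
        time.sleep(0.5)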
def ensure_topic():
    client = SimpleClient(hosts=KAFKA_SERVERS)
    client.ensure_topic_exists(KAFKA_TOPIC)
    client.close()
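# SimpleClient is deprecated (removed in kafka-python 2.0); an equivalent
# helper against the maintained admin API might look like this, reusing the
# same KAFKA_SERVERS/KAFKA_TOPIC settings. The partition and replication
# counts are assumptions.
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import TopicAlreadyExistsError

def ensure_topic_admin():
    admin = KafkaAdminClient(bootstrap_servers=KAFKA_SERVERS)
    try:
        admin.create_topics([NewTopic(KAFKA_TOPIC, num_partitions=1,
                                      replication_factor=1)])
    except TopicAlreadyExistsError:
        pass  # topic already present; nothing to do
    finally:
        admin.close()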
class TestTransSalesStreamProcessor(unittest.TestCase):
    """Test cases for the sales stream processor (exactly once)."""

    server = "localhost:9092"
    topics = None

    def setUp(self):
        ident = datetime.now().strftime("%Y%m%d_%H%M%S")
        database = "test_{}".format(ident)
        MysqlUtils.database = os.environ.get("mysql.database", database)
        MysqlUtils.username = os.environ.get("mysql.username", "marker")
        MysqlUtils.password = os.environ.get("mysql.password", "marker-secure")
        MysqlUtils.host = os.environ.get("mysql.host", "localhost")
        MysqlUtils.port = os.environ.get("mysql.port", "3306")

        if self.topics is None:
            self.topics = "test_topic_%s" % (ident)

        self.client = SimpleClient(self.server)
        self.client.ensure_topic_exists(self.topics)

        self.stream_processor = TransSalesStreamProcessor(
            batch_duration=1,
            bootstrap_servers="localhost:9092",
            topics=[self.topics],
        )

        with open(FIXTURE_PATH) as data:
            self.messages = json.load(data)

    def test_streaming(self):
        result = [()]

        def message_handler(data):
            if isinstance(data, list):
                result[0] = data

        self.stream_processor.handler = message_handler
        self.stream_processor.start_streaming(with_await=False)

        producer = KafkaProducer(
            bootstrap_servers=self.server,
            value_serializer=lambda message: json.dumps(message).encode("utf-8"))

        time.sleep(5)
        for message in self.messages:
            producer.send(self.topics, message)
        time.sleep(5)

        expected = {1: 1100.0, 2: 600.0}
        actual = {
            row["store_id"]: row["total_sales_price"]
            for row in result[0]
        }
        self.assertDictEqual(expected, actual)

    def tearDown(self):
        self.stream_processor.stop()
        MysqlUtils.cleanup()
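# For context, a pure-Python equivalent of the aggregation the processor is
# expected to produce in the test above. The message shape
# ({"store_id": ..., "sales_price": ...}) is an assumption, since the
# fixture file is not shown.
from collections import defaultdict

def total_sales_by_store(messages):
    totals = defaultdict(float)
    for message in messages:
        totals[message["store_id"]] += message["sales_price"]
    return [{"store_id": store_id, "total_sales_price": total}
            for store_id, total in totals.items()]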
# logging status of data simulation
logging.basicConfig(
    format='%(levelname)s: %(message)s',
    level=logging.INFO)

# establish Kafka client
kafkaClient = SimpleClient(KAFKA_LISTENER)
publisher = KafkaProducer(bootstrap_servers=KAFKA_LISTENER,
                          client_id=kafkaClient.client_id.decode('utf-8'))

# check whether each topic already exists; create it if not
if SENSOR_TOPIC in kafkaClient.topics:
    logging.info('Found kafka topic {}'.format(SENSOR_TOPIC))
else:
    kafkaClient.ensure_topic_exists(SENSOR_TOPIC)
    logging.info('Creating kafka topic {}'.format(SENSOR_TOPIC))

if LAB_TOPIC in kafkaClient.topics:
    logging.info('Found kafka topic {}'.format(LAB_TOPIC))
else:
    kafkaClient.ensure_topic_exists(LAB_TOPIC)
    logging.info('Creating kafka topic {}'.format(LAB_TOPIC))

if AVG_TOPIC in kafkaClient.topics:
    logging.info('Found kafka topic {}'.format(AVG_TOPIC))
else:
    kafkaClient.ensure_topic_exists(AVG_TOPIC)
    logging.info('Creating kafka topic {}'.format(AVG_TOPIC))

logging.info('Here are the available topics: {}'.format(kafkaClient.topics))

# notify about each line in the input file
programStartTime = datetime.datetime.utcnow()
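# The three if/else blocks above repeat one check; the same behavior can be
# expressed as a single loop over the topic names:
for topic in (SENSOR_TOPIC, LAB_TOPIC, AVG_TOPIC):
    if topic in kafkaClient.topics:
        logging.info('Found kafka topic {}'.format(topic))
    else:
        kafkaClient.ensure_topic_exists(topic)
        logging.info('Creating kafka topic {}'.format(topic))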
class TestSalesProcessor(unittest.TestCase):
    """Test cases for the sales processor."""

    server = "localhost:9092"
    topics = None

    def setUp(self):
        ident = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.checkpoints = ".check-%s" % (ident)

        if self.topics is None:
            self.topics = "test-topic-%s" % (ident)

        self.client = SimpleClient(self.server)
        self.client.ensure_topic_exists(self.topics)

        self.stream_processor = StreamingProcessor(
            batch_duration=1,
            bootstrap_servers="localhost:9092",
            topics=[self.topics],
            checkpoint=self.checkpoints)

        with open(FIXTURE_PATH) as data:
            self.messages = json.load(data)

    def test_streaming(self):
        result = [None]

        def message_handler(rdd):
            data = rdd.collect()
            if len(data) == 1:
                result[0] = data[0]

        self.stream_processor.handler = message_handler
        self.stream_processor.start_streaming(with_await=False)

        producer = KafkaProducer(
            bootstrap_servers=self.server,
            value_serializer=lambda message: json.dumps(message).encode("utf-8"))

        time.sleep(5)
        for message in self.messages:
            producer.send(self.topics, message)
        time.sleep(5)

        expected = {1: 1100.0, 2: 600.0}
        actual = {
            row["store_id"]: row["total_sales_price"]
            for row in json.loads(result[0])
        }
        self.assertDictEqual(expected, actual)

    def tearDown(self):
        if self.stream_processor.streaming_context:
            self.stream_processor.streaming_context.stop()
        self.stream_processor.spark_context.stop()
        try:
            shutil.rmtree(self.checkpoints)
        except OSError:
            pass