Example #1
    def __init__(
        self,
        spark,
        df_schema,
        key_deserializer,
        value_deserializer,
        host,
        topic,
        port=9092,
    ):
        """Initialize context manager

        Parameters `key_deserializer` and `value_deserializer` are callables
        that take bytes as input and return Python structures as output.

        Args:
            spark (SparklySession): currently active SparklySession
            df_schema (pyspark.sql.types.StructType): schema of dataframe to be generated
            key_deserializer (function): function used to deserialize the key
            value_deserializer (function): function used to deserialize the value
            host (basestring): host or IP address of the Kafka server to connect to
            topic (basestring): Kafka topic to monitor
            port (int): port number of the Kafka server to connect to
        """
        self.spark = spark
        self.topic = topic
        self.df_schema = df_schema
        self.key_deser, self.val_deser = key_deserializer, value_deserializer
        self.host, self.port = host, port
        self._df = None
        self.count = 0

        kafka_client = SimpleClient(host)
        kafka_client.ensure_topic_exists(topic)
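The docstring above only describes the deserializer parameters; a minimal sketch of what such callables might look like follows, assuming JSON-encoded values and UTF-8 keys (the function names below are illustrative assumptions, not part of the original example).

# Hedged sketch of deserializer callables matching the docstring in Example #1:
# each takes raw bytes and returns a Python structure. The names are illustrative.
import json

def json_value_deserializer(raw_bytes):
    # bytes in, Python structure out
    return json.loads(raw_bytes.decode('utf-8'))

def utf8_key_deserializer(raw_bytes):
    return raw_bytes.decode('utf-8')

# These callables would be passed as key_deserializer / value_deserializer
# when constructing the context manager shown above.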
Example #2
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:],
                               random_string(10))
            self.topic = topic

        if self.create_client:
            self.client = SimpleClient('%s:%d' %
                                       (self.server.host, self.server.port))
            self.client_async = KafkaClient(
                bootstrap_servers='%s:%d' %
                (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request(
                [OffsetRequestPayload(topic, partition, -1, 1)])
        except:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))

        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
Example #3
def send_to_kafka(message):
    producer = get_producer()
    try:
        producer.send(settings.KAFKA_TOPIC, message)
    except Exception:
        client = SimpleClient(hosts=settings.KAFKA_SERVERS)
        client.ensure_topic_exists(settings.KAFKA_TOPIC)
        client.close()
        producer.send(settings.KAFKA_TOPIC, message)
    producer.close(10)
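Example #3 relies on a get_producer() helper and a settings module that are not shown. A minimal sketch of what they might look like follows, assuming settings.KAFKA_SERVERS lists the bootstrap servers; the bodies below are assumptions, not the original project's code.

# Hedged sketch of the get_producer() helper assumed by Example #3.
from kafka import KafkaProducer

# `settings` stands in for the project's configuration module referenced in
# Example #3; only the attributes used there are sketched here.
class settings:
    KAFKA_SERVERS = ['localhost:9092']
    KAFKA_TOPIC = 'example-topic'

def get_producer():
    return KafkaProducer(bootstrap_servers=settings.KAFKA_SERVERS)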
Example #4
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
            self.topic = topic

        if self.create_client:
            self.client = SimpleClient('%s:%d' % (self.server.host, self.server.port))

        self.client.ensure_topic_exists(self.topic)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request([OffsetRequestPayload(topic, partition, -1, 1)])
        except:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))

        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
Example #5
    def test_ensure_topic_exists(self, decode_metadata_response, conn):

        mock_conn(conn)

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567, None),
            BrokerMetadata(1, 'broker_2', 5678, None)
        ]
        resp0_brokers = list(map(itemgetter(0, 1, 2), brokers))

        topics = [
            (NO_LEADER, 'topic_still_creating', []),
            (UNKNOWN_TOPIC_OR_PARTITION, 'topic_doesnt_exist', []),
            (NO_ERROR, 'topic_noleaders', [
                (NO_LEADER, 0, -1, [], []),
                (NO_LEADER, 1, -1, [], []),
            ]),
        ]
        decode_metadata_response.return_value = MetadataResponse[0](
            resp0_brokers, topics)

        client = SimpleClient(hosts=['broker_1:4567'])

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.ensure_topic_exists('topic_doesnt_exist', timeout=1)

        with self.assertRaises(KafkaTimeoutError):
            client.ensure_topic_exists('topic_still_creating', timeout=1)

        # This should not raise
        client.ensure_topic_exists('topic_noleaders', timeout=1)
Example #6
    def test_ensure_topic_exists(self, decode_metadata_response, conn):

        mock_conn(conn)

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567, None),
            BrokerMetadata(1, 'broker_2', 5678, None)
        ]
        resp0_brokers = list(map(itemgetter(0, 1, 2), brokers))

        topics = [
            (NO_LEADER, 'topic_still_creating', []),
            (UNKNOWN_TOPIC_OR_PARTITION, 'topic_doesnt_exist', []),
            (NO_ERROR, 'topic_noleaders', [
                (NO_LEADER, 0, -1, [], []),
                (NO_LEADER, 1, -1, [], []),
            ]),
        ]
        decode_metadata_response.return_value = MetadataResponse[0](resp0_brokers, topics)

        client = SimpleClient(hosts=['broker_1:4567'])

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.ensure_topic_exists('topic_doesnt_exist', timeout=1)

        with self.assertRaises(KafkaTimeoutError):
            client.ensure_topic_exists('topic_still_creating', timeout=1)

        # This should not raise
        client.ensure_topic_exists('topic_noleaders', timeout=1)
Example #7
def ensure_topic():
    client = SimpleClient(hosts=KAFKA_SERVERS)
    client.ensure_topic_exists(KAFKA_TOPIC)
    client.close()
Example #8
class TestTransSalesStreamProcessor(unittest.TestCase):
    """
    Test cases for the sales stream processor (exactly-once semantics).
    """
    server = "localhost:9092"
    topics = None

    def setUp(self):

        ident = datetime.now().strftime("%Y%m%d_%H%M%S")
        database = "test_{}".format(ident)

        MysqlUtils.database = os.environ.get("mysql.database", database)
        MysqlUtils.username = os.environ.get("mysql.username", "marker")
        MysqlUtils.password = os.environ.get("mysql.password", "marker-secure")
        MysqlUtils.host = os.environ.get("mysql.host", "localhost")
        MysqlUtils.port = os.environ.get("mysql.port", "3306")

        if self.topics is None:
            self.topics = "test_topic_%s" % (ident)

        self.client = SimpleClient(self.server)
        self.client.ensure_topic_exists(self.topics)

        self.stream_processor = TransSalesStreamProcessor(
            batch_duration=1,
            bootstrap_servers="localhost:9092",
            topics=[self.topics],
        )

        with open(FIXTURE_PATH) as data:
            self.messages = json.load(data)

    def test_streaming(self):
        result = [()]

        def message_handler(data):
            if isinstance(data, list):
                result[0] = data

        self.stream_processor.handler = message_handler

        self.stream_processor.start_streaming(with_await=False)

        producer = KafkaProducer(bootstrap_servers=self.server,
                                 value_serializer=lambda message: json.dumps(
                                     message).encode("utf-8"))

        time.sleep(5)

        for message in self.messages:
            producer.send(self.topics, message)

        time.sleep(5)

        expected = {1: 1100.0, 2: 600.0}

        actual = {
            row["store_id"]: row["total_sales_price"]
            for row in result[0]
        }

        self.assertDictEqual(expected, actual)

    def tearDown(self):
        self.stream_processor.stop()
        MysqlUtils.cleanup()
Example #9
    # configure logging to report the status of the data simulation
    logging.basicConfig(
        format='%(levelname)s: %(message)s', level=logging.INFO)

    # establish Kafka client
    kafkaClient = SimpleClient(KAFKA_LISTENER)
    # print('KAFKACLIENT FORMAT:{}'.format(type(kafkaClient.client_id)))
    publisher = KafkaProducer(bootstrap_servers=KAFKA_LISTENER,
                              client_id=kafkaClient.client_id.decode('utf-8'))

    # check whether the sensor, lab, and average topics exist; create any that are missing
    if SENSOR_TOPIC in kafkaClient.topics:
        logging.info('Found kafka topic {}'.format(SENSOR_TOPIC))
    else:
        kafkaClient.ensure_topic_exists(SENSOR_TOPIC)
        logging.info('Creating kafka topic {}'.format(SENSOR_TOPIC))
    if LAB_TOPIC in kafkaClient.topics:
        logging.info('Found kafka topic {}'.format(LAB_TOPIC))
    else:
        kafkaClient.ensure_topic_exists(LAB_TOPIC)
        logging.info('Creating kafka topic {}'.format(LAB_TOPIC))
    if AVG_TOPIC in kafkaClient.topics:
        logging.info('Found kafka topic {}'.format(AVG_TOPIC))
    else:
        kafkaClient.ensure_topic_exists(AVG_TOPIC)
        logging.info('Creating kafka topic {}'.format(AVG_TOPIC))
    logging.info('Here are the available topics: {}'.format(kafkaClient.topics))

    # notify about each line in the input file
    programStartTime = datetime.datetime.utcnow()
Example #10
class TestSalesProcessor(unittest.TestCase):
    """
    Test cases for the sales processor.
    """
    server = "localhost:9092"
    topics = None

    def setUp(self):

        ident = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.checkpoints = ".check-%s" % (ident)
        if self.topics is None:
            self.topics = "test-topic-%s" % (ident)

        self.client = SimpleClient(self.server)
        self.client.ensure_topic_exists(self.topics)

        self.stream_processor = StreamingProcessor(
            batch_duration=1,
            bootstrap_servers="localhost:9092",
            topics=[self.topics],
            checkpoint=self.checkpoints)

        with open(FIXTURE_PATH) as data:
            self.messages = json.load(data)

    def test_streaming(self):
        result = [None]

        def message_handler(rdd):
            data = rdd.collect()
            if len(data) == 1:
                result[0] = data[0]

        self.stream_processor.handler = message_handler

        self.stream_processor.start_streaming(with_await=False)

        producer = KafkaProducer(bootstrap_servers=self.server,
                                 value_serializer=lambda message: json.dumps(
                                     message).encode("utf-8"))

        time.sleep(5)

        for message in self.messages:
            producer.send(self.topics, message)

        time.sleep(5)

        expected = {1: 1100.0, 2: 600.0}
        actual = {
            row["store_id"]: row["total_sales_price"]
            for row in json.loads(result[0])
        }

        self.assertDictEqual(expected, actual)

    def tearDown(self):
        if self.stream_processor.streaming_context:
            self.stream_processor.streaming_context.stop()
        self.stream_processor.spark_context.stop()
        try:
            shutil.rmtree(self.checkpoints)
        except BaseException:
            pass