Example #1
    def test_read_by_offsets(self):
        offsets = kafka_get_topics_offsets('kafka.docker', self.topic)
        df = self.spark.read_ext.kafka(
            'kafka.docker',
            topic=self.topic,
            offset_ranges=offsets,
            key_deserializer=self.json_decoder,
            value_deserializer=self.json_decoder,
            schema=self.expected_data_df.schema,
        )

        self.assertDataFrameEqual(df, self.expected_data)

        self.fixture.setup_data()

        offsets = kafka_get_topics_offsets('kafka.docker', self.topic)
        df = self.spark.read_ext.kafka(
            'kafka.docker',
            topic=self.topic,
            offset_ranges=offsets,
            key_deserializer=self.json_decoder,
            value_deserializer=self.json_decoder,
            schema=self.expected_data_df.schema,
        )

        self.assertDataFrameEqual(df, self.expected_data * 2)
Example #2
    def __enter__(self):
        # Snapshot the topic offsets before the wrapped block runs.
        self._df = None
        self.count = 0
        self.pre_offsets = kafka_get_topics_offsets(
            topic=self.topic,
            host=self.host,
            port=self.port,
        )
Example #3
    def __exit__(self, e_type, e_value, e_trace):
        # Snapshot the offsets again and count how many new messages were
        # written while the context was active (end-offset delta per partition).
        self.post_offsets = kafka_get_topics_offsets(
            topic=self.topic,
            host=self.host,
            port=self.port,
        )
        self.count = sum([
            post[2] - pre[2]
            for pre, post in zip(self.pre_offsets, self.post_offsets)
        ])
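The count computed in `__exit__` is just the sum of per-partition end-offset deltas between the two snapshots. A minimal standalone sketch of that arithmetic, using made-up offsets in the (partition, start_offset, end_offset) shape seen above:

# Illustrative only: hypothetical offsets in the (partition, start_offset, end_offset)
# shape returned by kafka_get_topics_offsets.
pre_offsets = [(0, 0, 10), (1, 0, 7)]    # snapshot taken in __enter__
post_offsets = [(0, 0, 16), (1, 0, 9)]   # snapshot taken in __exit__

# New messages per partition are the end-offset deltas; the total is their sum.
count = sum(post[2] - pre[2] for pre, post in zip(pre_offsets, post_offsets))
print(count)  # 8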
Example #4
    def kafka(self,
              host,
              topic,
              offset_ranges=None,
              key_deserializer=None,
              value_deserializer=None,
              schema=None,
              port=9092,
              parallelism=None,
              options=None):
        """Creates dataframe from specified set of messages from Kafka topic.

        Defining ranges:
            - If `offset_ranges` is specified, it defines which specific ranges to read.
            - If `offset_ranges` is omitted, the topic's partitions and offsets are auto-discovered.

        The `schema` parameter, if specified, should contain two top level fields:
        `key` and `value`.

        Parameters `key_deserializer` and `value_deserializer` are callables
        that take bytes as input and return Python structures as output.

        Args:
            host (str): Kafka host.
            topic (str): Kafka topic to read from.
            offset_ranges (list[(int, int, int)]|None): List of partition ranges
                [(partition, start_offset, end_offset)].
            key_deserializer (function): Function used to deserialize the key.
            value_deserializer (function): Function used to deserialize the value.
            schema (pyspark.sql.types.StructType): Schema to apply when creating the DataFrame.
            port (int): Kafka port.
            parallelism (int|None): The max number of parallel tasks that could be executed
                during the read stage (see :ref:`controlling-the-load`).
            options (dict|None): Additional kafka parameters, see KafkaUtils.createRDD docs.

        Returns:
            pyspark.sql.DataFrame

        Raises:
            InvalidArgumentError
        """
        assert self._spark.has_package('org.apache.spark:spark-streaming-kafka')

        if not key_deserializer or not value_deserializer or not schema:
            raise InvalidArgumentError('You should specify all of the following parameters: '
                                       '`key_deserializer`, `value_deserializer` and `schema`')

        kafka_params = {
            'metadata.broker.list': '{}:{}'.format(host, port),
        }

        if options:
            kafka_params.update(options)

        if not offset_ranges:
            offset_ranges = kafka_get_topics_offsets(host, topic, port)

        offset_ranges = [OffsetRange(topic, partition, start_offset, end_offset)
                         for partition, start_offset, end_offset in offset_ranges]

        rdd = KafkaUtils.createRDD(self._spark.sparkContext,
                                   kafkaParams=kafka_params,
                                   offsetRanges=offset_ranges,
                                   keyDecoder=key_deserializer,
                                   valueDecoder=value_deserializer,
                                   )

        if parallelism:
            rdd = rdd.coalesce(parallelism)

        return self._spark.createDataFrame(rdd, schema=schema)
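For reference, a hedged call sketch of this reader, mirroring the test in Example #1. The session object (`spark`), topic name, and schema fields below are placeholders chosen for illustration, not taken from the library:

import json

from pyspark.sql.types import IntegerType, StringType, StructField, StructType


def json_decoder(raw_bytes):
    # Deserializers receive raw bytes and must return Python structures.
    return json.loads(raw_bytes.decode('utf-8'))


# The schema needs the two top-level fields described in the docstring: `key` and `value`.
schema = StructType([
    StructField('key', StructType([StructField('user_id', StringType())])),
    StructField('value', StructType([StructField('clicks', IntegerType())])),
])

# `spark` is assumed to be a session exposing `read_ext`, as in Example #1.
df = spark.read_ext.kafka(
    'kafka.docker',
    topic='clickstream.events',
    key_deserializer=json_decoder,
    value_deserializer=json_decoder,
    schema=schema,
    port=9092,
    parallelism=4,
)
df.show()

Omitting `offset_ranges`, as in this sketch, falls back to `kafka_get_topics_offsets` to discover the partitions and their offsets before building the RDD.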