Esempio n. 1
0
File: kafka.py Progetto: 31z4/spark
    def createStream(ssc, zkQuorum, groupId, topics, kafkaParams={},
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        """
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")
        jtopics = MapConverter().convert(topics, ssc.sparkContext._gateway._gateway_client)
        jparam = MapConverter().convert(kafkaParams, ssc.sparkContext._gateway._gateway_client)
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027)
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, jparam, jtopics, jlevel)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                print("""
________________________________________________________________________________________________

  Spark Streaming's Kafka libraries not found in class path. Try one of the following.

  1. Include the Kafka library and its dependencies with in the
     spark-submit command as

     $ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka:%s ...

  2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
     Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s.
     Then, include the jar in the spark-submit command as

     $ bin/spark-submit --jars <spark-streaming-kafka-assembly.jar> ...

________________________________________________________________________________________________

""" % (ssc.sparkContext.version, ssc.sparkContext.version))
            raise e
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Esempio n. 2
0
    def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka (default is None, meaning no extra
                            params). The mapping is copied; the caller's object is not modified.
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        :raises TypeError: if ``topics`` is not a dict.
        """
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")

        # Copy before injecting the connection settings so the caller's dict is left
        # untouched and can be reused for other streams.
        kafkaParams = {} if kafkaParams is None else dict(kafkaParams)
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })

        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        helper = KafkaUtils._get_helper(ssc._sc)
        jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel)
        # Keys and values arrive as raw bytes; decoding happens in the map below.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Esempio n. 3
0
    def createStream(ssc,
                     kinesisAppName,
                     streamName,
                     endpointUrl,
                     regionName,
                     initialPositionInStream,
                     checkpointInterval,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     awsAccessKeyId=None,
                     awsSecretKey=None,
                     decoder=utf8_decoder):
        """
        Build a DStream of decoded records pulled from a Kinesis stream through the
        Kinesis Client Library (KCL).

        Note: when checkpointing is enabled the supplied AWS credentials are written into the
        DStream checkpoints, so the checkpoint directory must be kept secure.

        :param ssc:  StreamingContext object
        :param kinesisAppName:  Application name the KCL uses when updating DynamoDB
        :param streamName:  Name of the Kinesis stream
        :param endpointUrl:  Kinesis service URL (e.g., https://kinesis.us-east-1.amazonaws.com)
        :param regionName:  Region the KCL uses for DynamoDB (lease coordination and
                            checkpointing) and CloudWatch (metrics)
        :param initialPositionInStream:  Starting position used when no Kinesis checkpoint
                                         info exists: InitialPositionInStream.TRIM_HORIZON
                                         (beginning of the stream, per Kinesis' 24 hour limit)
                                         or InitialPositionInStream.LATEST (tip of the stream).
        :param checkpointInterval:  Interval for Kinesis checkpointing; see the Kinesis Spark
                                    Streaming documentation for the kinds of checkpoints.
        :param storageLevel:  Storage level for received objects (default is
                              StorageLevel.MEMORY_AND_DISK_2)
        :param awsAccessKeyId:  AWS AccessKeyId (default is None, in which case
                                DefaultAWSCredentialsProviderChain is used)
        :param awsSecretKey:  AWS SecretKey (default is None, in which case
                              DefaultAWSCredentialsProviderChain is used)
        :param decoder:  Function applied to every received value (default is utf8_decoder)
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jduration = ssc._jduration(checkpointInterval)

        try:
            # KinesisUtilsPythonHelper is the bridge to Scala's KinesisUtils; it is looked up
            # through the context class loader and instantiated reflectively.
            loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = loader.loadClass(
                "org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper").newInstance()
            jstream = helper.createStream(
                ssc._jssc, kinesisAppName, streamName, endpointUrl, regionName,
                initialPositionInStream, jduration, jlevel, awsAccessKeyId, awsSecretKey)
        except Py4JJavaError as e:
            class_missing = 'ClassNotFoundException' in str(e.java_exception)
            if class_missing:
                KinesisUtils._printErrorMsg(ssc.sparkContext)
            raise e

        def decode_record(record):
            return decoder(record)

        # Records are handed over undeserialized; decoding is done by the mapped function.
        return DStream(jstream, ssc, NoOpSerializer()).map(decode_record)
Esempio n. 4
0
    def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName,
                     initialPositionInStream, checkpointInterval,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder,
                     stsAssumeRoleArn=None, stsSessionName=None, stsExternalId=None):
        """
        Build a DStream of decoded records pulled from a Kinesis stream through the
        Kinesis Client Library (KCL).

        .. note:: When checkpointing is enabled the supplied AWS credentials are written into
            the DStream checkpoints, so the checkpoint directory must be kept secure.

        :param ssc:  StreamingContext object
        :param kinesisAppName:  Application name the KCL uses when updating DynamoDB
        :param streamName:  Name of the Kinesis stream
        :param endpointUrl:  Kinesis service URL (e.g., https://kinesis.us-east-1.amazonaws.com)
        :param regionName:  Region the KCL uses for DynamoDB (lease coordination and
                            checkpointing) and CloudWatch (metrics)
        :param initialPositionInStream:  Starting position used when no Kinesis checkpoint
                                         info exists: InitialPositionInStream.TRIM_HORIZON
                                         (beginning of the stream, per Kinesis' 24 hour limit)
                                         or InitialPositionInStream.LATEST (tip of the stream).
        :param checkpointInterval:  Interval for Kinesis checkpointing; see the Kinesis Spark
                                    Streaming documentation for the kinds of checkpoints.
        :param storageLevel:  Storage level for received objects (default is
                              StorageLevel.MEMORY_AND_DISK_2)
        :param awsAccessKeyId:  AWS AccessKeyId (default is None, in which case
                                DefaultAWSCredentialsProviderChain is used)
        :param awsSecretKey:  AWS SecretKey (default is None, in which case
                              DefaultAWSCredentialsProviderChain is used)
        :param decoder:  Function applied to every received value (default is utf8_decoder)
        :param stsAssumeRoleArn: ARN of the IAM role to assume when STS sessions are used to
                                 read from the stream (default is None).
        :param stsSessionName: Name uniquely identifying the STS sessions used to read from
                               the stream, if STS is being used (default is None).
        :param stsExternalId: External ID validated against the assumed IAM role's trust
                              policy, if STS is being used (default is None).
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jduration = ssc._jduration(checkpointInterval)

        try:
            # KinesisUtilsPythonHelper is the bridge to Scala's KinesisUtils. If the name
            # resolves to a JavaPackage instead of a class, calling it raises TypeError.
            helper_cls = ssc._jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper
            helper = helper_cls()
        except TypeError as e:
            not_a_class = str(e) == "'JavaPackage' object is not callable"
            if not_a_class:
                KinesisUtils._printErrorMsg(ssc.sparkContext)
            raise

        jstream = helper.createStream(
            ssc._jssc, kinesisAppName, streamName, endpointUrl, regionName,
            initialPositionInStream, jduration, jlevel,
            awsAccessKeyId, awsSecretKey,
            stsAssumeRoleArn, stsSessionName, stsExternalId)

        def decode_record(record):
            return decoder(record)

        # Records are handed over undeserialized; decoding is done by the mapped function.
        return DStream(jstream, ssc, NoOpSerializer()).map(decode_record)
Esempio n. 5
0
    def createDirectStream(ssc,
                           topics,
                           kafkaParams,
                           fromOffsets=None,
                           keyDecoder=utf8_decoder,
                           valueDecoder=utf8_decoder):
        """
        .. note:: Experimental

        Create an input stream that pulls messages directly from a Kafka Broker, starting at
        specific offsets.

        No receiver is involved: messages are fetched from Kafka in every batch duration and
        processed without being stored.

        Offsets are not kept in Zookeeper; the stream tracks the consumed offsets itself. To
        interoperate with Kafka monitoring tools that rely on Zookeeper, the streaming
        application has to update Kafka/Zookeeper on its own. The offsets used in each batch
        can be accessed from the generated RDDs.

        Recovering from driver failures requires checkpointing to be enabled in the
        StreamingContext; the consumed-offset information is then restored from the
        checkpoint. See the programming guide for details (constraints, etc.).

        :param ssc:  StreamingContext object.
        :param topics:  list of topic_name to consume.
        :param kafkaParams: Additional params for Kafka.
        :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
                            point of the stream.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder).
        :param valueDecoder:  A function used to decode value (default is utf8_decoder).
        :return: A DStream object
        """
        if not isinstance(topics, list):
            raise TypeError("topics should be list")
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        offsets = dict() if fromOffsets is None else fromOffsets

        try:
            loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = loader.loadClass(
                "org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper").newInstance()

            # Each key is turned into its JVM counterpart via the helper; the offset values
            # are passed through unchanged.
            jfromOffsets = dict((topic_partition._jTopicAndPartition(helper), offset)
                                for topic_partition, offset in offsets.items())
            jstream = helper.createDirectStream(
                ssc._jssc, kafkaParams, set(topics), jfromOffsets)
        except Py4JJavaError as e:
            class_missing = 'ClassNotFoundException' in str(e.java_exception)
            if class_missing:
                KafkaUtils._printErrorMsg(ssc.sparkContext)
            raise e

        def decode_pair(k_v):
            return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

        # Keys and values arrive as raw bytes; decoding happens in the mapped function.
        pair_ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        return DStream(jstream, ssc, pair_ser).map(decode_pair)
Esempio n. 6
0
    def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName,
                     initialPositionInStream, checkpointInterval,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder):
        """
        Build a DStream of decoded records pulled from a Kinesis stream through the
        Kinesis Client Library (KCL).

        Note: when checkpointing is enabled the supplied AWS credentials are written into the
        DStream checkpoints, so the checkpoint directory must be kept secure.

        :param ssc:  StreamingContext object
        :param kinesisAppName:  Application name the KCL uses when updating DynamoDB
        :param streamName:  Name of the Kinesis stream
        :param endpointUrl:  Kinesis service URL (e.g., https://kinesis.us-east-1.amazonaws.com)
        :param regionName:  Region the KCL uses for DynamoDB (lease coordination and
                            checkpointing) and CloudWatch (metrics)
        :param initialPositionInStream:  Starting position used when no Kinesis checkpoint
                                         info exists: InitialPositionInStream.TRIM_HORIZON
                                         (beginning of the stream, per Kinesis' 24 hour limit)
                                         or InitialPositionInStream.LATEST (tip of the stream).
        :param checkpointInterval:  Interval for Kinesis checkpointing; see the Kinesis Spark
                                    Streaming documentation for the kinds of checkpoints.
        :param storageLevel:  Storage level for received objects (default is
                              StorageLevel.MEMORY_AND_DISK_2)
        :param awsAccessKeyId:  AWS AccessKeyId (default is None, in which case
                                DefaultAWSCredentialsProviderChain is used)
        :param awsSecretKey:  AWS SecretKey (default is None, in which case
                              DefaultAWSCredentialsProviderChain is used)
        :param decoder:  Function applied to every received value (default is utf8_decoder)
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jduration = ssc._jduration(checkpointInterval)

        helper_class_name = "org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper"
        try:
            # The helper that fronts Scala's KinesisUtils is resolved through the context
            # class loader and instantiated reflectively.
            context_loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = context_loader.loadClass(helper_class_name).newInstance()
            jstream = helper.createStream(
                ssc._jssc,
                kinesisAppName, streamName, endpointUrl, regionName,
                initialPositionInStream, jduration, jlevel,
                awsAccessKeyId, awsSecretKey)
        except Py4JJavaError as e:
            java_error_text = str(e.java_exception)
            if 'ClassNotFoundException' in java_error_text:
                KinesisUtils._printErrorMsg(ssc.sparkContext)
            raise e

        # Records are handed over undeserialized; decoding is done by the mapped lambda.
        raw_stream = DStream(jstream, ssc, NoOpSerializer())
        decoded_stream = raw_stream.map(lambda v: decoder(v))
        return decoded_stream
Esempio n. 7
0
    def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None,
                           keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        .. note:: Experimental

        Create an input stream that directly pulls messages from a Kafka Broker and specific offset.

        This is not a receiver based Kafka input stream, it directly pulls the message from Kafka
        in each batch duration and processed without storing.

        This does not use Zookeeper to store offsets. The consumed offsets are tracked
        by the stream itself. For interoperability with Kafka monitoring tools that depend on
        Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
        You can access the offsets used in each batch from the generated RDDs.

        To recover from driver failures, you have to enable checkpointing in the StreamingContext.
        The information on consumed offset can be recovered from the checkpoint.
        See the programming guide for details (constraints, etc.).

        :param ssc:  StreamingContext object.
        :param topics:  list of topic_name to consume.
        :param kafkaParams: Additional params for Kafka.
        :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
                            point of the stream (default is None, meaning no explicit offsets).
        :param keyDecoder:  A function used to decode key (default is utf8_decoder).
        :param valueDecoder:  A function used to decode value (default is utf8_decoder).
        :return: A DStream object
        :raises TypeError: if ``topics`` is not a list or ``kafkaParams`` is not a dict.
        :raises Py4JJavaError: re-raised from the JVM call; when it wraps a
                               ClassNotFoundException a help message is printed first.
        """
        # A None sentinel replaces the former shared ``{}`` default so no mutable object
        # is shared between calls.
        if fromOffsets is None:
            fromOffsets = {}
        if not isinstance(topics, list):
            raise TypeError("topics should be list")
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")

        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()

            # Each key is turned into its JVM counterpart via the helper; the offset values
            # are passed through unchanged.
            jfromOffsets = dict((k._jTopicAndPartition(helper), v)
                                for (k, v) in fromOffsets.items())
            jstream = helper.createDirectStream(ssc._jssc, kafkaParams, set(topics), jfromOffsets)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(ssc.sparkContext)
            # Bare raise keeps the original traceback intact.
            raise

        # Keys and values arrive as raw bytes; decoding happens in the map below.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Esempio n. 8
0
    def createStream(ssc,
                     zkQuorum,
                     groupId,
                     topics,
                     kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder,
                     valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka (default is None, meaning no extra
                            params). The mapping is copied; the caller's object is not modified.
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        :raises TypeError: if ``topics`` is not a dict.
        :raises Py4JJavaError: re-raised from the JVM call; when it wraps a
                               ClassNotFoundException a help message is printed first.
        """
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")

        # Copy before injecting the connection settings so the caller's dict is left
        # untouched and can be reused for other streams.
        kafkaParams = {} if kafkaParams is None else dict(kafkaParams)
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027)
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, kafkaParams, topics,
                                          jlevel)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(ssc.sparkContext)
            # Bare raise keeps the original traceback intact.
            raise
        # Keys and values arrive as raw bytes; decoding happens in the map below.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v:
                          (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Esempio n. 9
0
    def _toPythonDStream(ssc, jstream, bodyDecoder):
        """
        Wrap a JVM stream of (headers, body) byte pairs as a Python DStream of
        (headers dict, decoded body) tuples.

        :param ssc:  StreamingContext object
        :param jstream:  JVM-side stream handed to the DStream constructor
        :param bodyDecoder:  A function applied to the raw body (second element) of each event
        :return: A DStream whose elements are ``(headers, body)`` tuples
        """
        # Both halves of each event are kept as raw bytes; they are parsed in ``func``.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)

        def func(event):
            # Compare the numeric major version: the string test ``sys.version >= "3"``
            # is lexicographic and would misreport a two-digit major version.
            if sys.version_info[0] >= 3:
                headersBytes = BytesIO(event[0])
            else:
                headersBytes = StringIO(event[0])
            headers = {}
            strSer = UTF8Deserializer()
            # Header layout: an int count followed by that many key/value string pairs.
            for _ in range(read_int(headersBytes)):
                key = strSer.loads(headersBytes)
                value = strSer.loads(headersBytes)
                headers[key] = value
            body = bodyDecoder(event[1])
            return (headers, body)
        return stream.map(func)
Esempio n. 10
0
    def _toPythonDStream(ssc, jstream, bodyDecoder):
        """
        Wrap a JVM stream of (headers, body) byte pairs as a Python DStream of
        (headers dict, decoded body) tuples.

        :param ssc:  StreamingContext object
        :param jstream:  JVM-side stream handed to the DStream constructor
        :param bodyDecoder:  A function applied to the raw body (second element) of each event
        :return: A DStream whose elements are ``(headers, body)`` tuples
        """
        # Both halves of each event are kept as raw bytes; they are parsed in ``func``.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)

        def func(event):
            # Compare the numeric major version: the string test ``sys.version >= "3"``
            # is lexicographic and would misreport a two-digit major version.
            if sys.version_info[0] >= 3:
                headersBytes = BytesIO(event[0])
            else:
                headersBytes = StringIO(event[0])
            headers = {}
            strSer = UTF8Deserializer()
            # Header layout: an int count followed by that many key/value string pairs.
            for _ in range(read_int(headersBytes)):
                key = strSer.loads(headersBytes)
                value = strSer.loads(headersBytes)
                headers[key] = value
            body = bodyDecoder(event[1])
            return (headers, body)
        return stream.map(func)
Esempio n. 11
0
    def createStreams(ssc,
                      consumerId,
                      topic,
                      tags,
                      accessKeyId,
                      accessKeySecret,
                      storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        Build a DStream that pulls messages from an Aliyun ONS stream.

        :param ssc: StreamingContext object.
        :param consumerId: Name of a set of consumers.
        :param topic: Topic to subscribe to.
        :param tags: Tag to subscribe to.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        helper_class_name = "org.apache.spark.streaming.aliyun.ons.OnsUtilsHelper"
        try:
            # The JVM helper is resolved through the context class loader and
            # instantiated reflectively.
            loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = loader.loadClass(helper_class_name).newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            jstream = helper.createDefaultStreams(
                ssc._jssc, consumerId, topic, tags, accessKeyId, accessKeySecret, jlevel)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            class_missing = 'ClassNotFoundException' in str(e.java_exception)
            if class_missing:
                OnsUtils._printErrorMsg()
            raise e
        else:
            # Elements are deserialized with UTF8Deserializer.
            return DStream(jstream, ssc, UTF8Deserializer())
Esempio n. 12
0
    def createStream(ssc,
                     brokerUrl,
                     topic,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Build a DStream that pulls messages from a Mqtt Broker.

        :param ssc:  StreamingContext object
        :param brokerUrl:  Url of remote mqtt publisher
        :param topic:  Name of the topic to subscribe to
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        helper_class_name = "org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper"
        try:
            # The JVM helper is resolved through the context class loader and
            # instantiated reflectively.
            loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = loader.loadClass(helper_class_name).newInstance()
            jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
        except Py4JJavaError as e:
            class_missing = 'ClassNotFoundException' in str(e.java_exception)
            if class_missing:
                MQTTUtils._printErrorMsg(ssc.sparkContext)
            raise e
        else:
            # Elements are deserialized with UTF8Deserializer.
            return DStream(jstream, ssc, UTF8Deserializer())
Esempio n. 13
0
    def createPullingStreamAsRawBytes(
            ssc,
            queueName,
            accessKeyId,
            accessKeySecret,
            endpoint,
            storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        Build a DStream by delegating to ``MnsUtilsHelper.createPullingStreamAsRawBytes``
        on the JVM side.

        :param ssc: StreamingContext object.
        :param queueName: The name of MNS queue.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param endpoint: The endpoint of MNS service.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        helper_class_name = "org.apache.spark.streaming.aliyun.mns.MnsUtilsHelper"
        try:
            # The JVM helper is resolved through the context class loader and
            # instantiated reflectively.
            loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = loader.loadClass(helper_class_name).newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            jstream = helper.createPullingStreamAsRawBytes(
                ssc._jssc, queueName, accessKeyId, accessKeySecret, endpoint, jlevel)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            class_missing = 'ClassNotFoundException' in str(e.java_exception)
            if class_missing:
                MnsUtils._printErrorMsg()
            raise e
        else:
            # NOTE(review): despite "RawBytes" in the name, elements are deserialized with
            # UTF8Deserializer here - confirm this is intended.
            return DStream(jstream, ssc, UTF8Deserializer())
Esempio n. 14
0
    def createDirectStream(ssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                           accessKeyId, accessKeySecret, loghubEndpoint,
                           zkParams, cursorPositionMode, cursorStartTime=-1):
        """
        Build a direct-API DStream by delegating to ``LoghubUtilsHelper.createDirectStream``
        on the JVM side.

        :param ssc: StreamingContext object.
        :param logServiceProject: The name of `LogService` project.
        :param logStoreName: The name of logStore.
        :param loghubConsumerGroupName: The group name of loghub consumer. Consumer processes
                                       sharing the same group name consume a given logStore
                                       together.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param loghubEndpoint: The endpoint of loghub.
        :param zkParams: Zookeeper properties.
        :param cursorPositionMode: User defined cursor mode.
        :param cursorStartTime: User defined cursor position (Unix Timestamp), -1 default.
        :return: A Direct api DStream object.
        """
        helper_class_name = "org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper"
        try:
            # The JVM helper is resolved through the context class loader and
            # instantiated reflectively.
            loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = loader.loadClass(helper_class_name).newInstance()
            jstream = helper.createDirectStream(
                ssc._jssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                accessKeyId, accessKeySecret, loghubEndpoint, zkParams,
                cursorPositionMode, cursorStartTime)
        except Py4JJavaError as e:
            class_missing = 'ClassNotFoundException' in str(e.java_exception)
            if class_missing:
                LoghubUtils._printErrorMsg()
            raise e
        else:
            # Elements are deserialized with UTF8Deserializer.
            return DStream(jstream, ssc, UTF8Deserializer())
Esempio n. 15
0
    def createStream(ssc, hostname, port, queueName, threadNum,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     decoder=utf8_decoder):
        """
        Build a DStream of decoded values by delegating to the JVM helper returned by
        ``MQUtils._get_helper``.

        NOTE(review): the previous docstring described this source as "Memcache" and the
        host/port as a Flume sink; that wording looks copy-pasted - confirm the actual
        message-queue backend.

        :param ssc:  StreamingContext object
        :param hostname:  Host name forwarded to the helper (presumably the queue server)
        :param port:  Port forwarded to the helper (presumably the queue server's port)
        :param queueName:  Queue name forwarded to the helper
        :param threadNum:  Forwarded to the helper (presumably the number of consumer
                           threads - TODO confirm)
        :param storageLevel:  Storage level to use for storing the received objects
        :param decoder:  A function applied to every received value (default is utf8_decoder)
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        mq_helper = MQUtils._get_helper(ssc._sc)
        jstream = mq_helper.createStream(
            ssc._jssc, hostname, port, queueName, threadNum, jlevel)

        def decode_value(value):
            return decoder(value)

        # Values are handed over undeserialized; decoding is done by the mapped function.
        return DStream(jstream, ssc, NoOpSerializer()).map(decode_value)
Esempio n. 16
0
class KafkaUtils(object):
    """Factory for receiver-style Kafka input streams."""

    @staticmethod
    def createStream(ssc,
                     zkQuorum,
                     groupId,
                     topics,
                     kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder,
                     valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka (default is None, meaning
                            no extra params). The dict passed in is copied and is
                            never modified.
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        :raises TypeError: if ``topics`` is not a dict
        :raises Py4JError: re-raised after printing a classpath hint when the
                           Kafka classes cannot be reached through the gateway
        """
        java_import(ssc._jvm, "org.apache.spark.streaming.kafka.KafkaUtils")

        # Work on a private copy: updating a shared default dict (or the caller's
        # dict) would leak the Zookeeper settings of one call into the next.
        params = dict(kafkaParams) if kafkaParams else {}
        params.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")
        gateway_client = ssc.sparkContext._gateway._gateway_client
        jtopics = MapConverter().convert(topics, gateway_client)
        jparam = MapConverter().convert(params, gateway_client)
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        def getClassByName(name):
            return ssc._jvm.org.apache.spark.util.Utils.classForName(name)

        try:
            # "[B" is the JVM class name of byte[]; keys and values stay raw bytes
            # on the JVM side and are decoded in Python below.
            array = getClassByName("[B")
            decoder = getClassByName("kafka.serializer.DefaultDecoder")
            jstream = ssc._jvm.KafkaUtils.createStream(ssc._jssc, array, array,
                                                       decoder, decoder,
                                                       jparam, jtopics, jlevel)
        except Py4JError as e:
            # TODO: use --jar once it also work on driver
            message = str(e)
            if not message or 'call a package' in message:
                print("No kafka package, please put the assembly jar into classpath:")
                print(" $ bin/spark-submit --driver-class-path external/kafka-assembly/target/"
                      "scala-*/spark-streaming-kafka-assembly-*.jar")
            # Bare raise keeps the original traceback.
            raise
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        # Index the pair explicitly; tuple-unpacking lambda parameters do not
        # exist in Python 3.
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
    def createStream(ssc, arn, destination, awsRegion, awsKey, awsKecret,
                     initialPosition, storageLevel, maxRecord, interval):
        """
        Create a DStream through the JVM ``DynamoDBPythonUtils.DynamoDBStream`` call.

        ``storageLevel`` is converted to its Java counterpart; every other
        argument after ``ssc`` is forwarded to the JVM call unchanged.

        :return: A DStream whose records are decoded as UTF-8 text; ``None``
                 records are passed through as ``None``
        """
        jvm = ssc.sparkContext._gateway.jvm
        java_import(
            jvm,
            "org.apache.spark.streaming.kinesis.DynamoDBStream.DynamoDBPythonUtils"
        )
        python_utils = jvm.DynamoDBPythonUtils

        def _to_text(record):
            if record is None:
                return None
            return record.decode("utf-8")

        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
        java_stream = python_utils.DynamoDBStream(
            ssc._jssc, arn, destination, awsRegion, awsKey, awsKecret,
            initialPosition, java_storage_level, maxRecord, interval)

        return DStream(java_stream, ssc, NoOpSerializer()).map(_to_text)
Esempio n. 18
0
    def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        The Zookeeper connection settings are written into ``kafkaParams``
        before the stream is created, so a dict supplied by the caller is
        updated in place.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        """
        zookeeper_settings = {
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        }
        if kafkaParams is None:
            kafkaParams = {}
        kafkaParams.update(zookeeper_settings)

        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")
        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027)
            class_loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = class_loader.loadClass(
                "org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper").newInstance()
            java_stream = helper.createStream(
                ssc._jssc, kafkaParams, topics, java_storage_level)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            helper_missing = 'ClassNotFoundException' in str(e.java_exception)
            if helper_missing:
                KafkaUtils._printErrorMsg(ssc.sparkContext)
            raise e

        def _decode_pair(pair):
            return (keyDecoder(pair[0]), valueDecoder(pair[1]))

        pair_serializer = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        return DStream(java_stream, ssc, pair_serializer).map(_decode_pair)
Esempio n. 19
0
    def createStream(ssc, host, port, address):
        """
        Create a DStream from the stream built by the JVM ``AMQPUtilsPythonHelper``.

        :param ssc:  StreamingContext object
        :param host:  Forwarded unchanged to the helper's ``createStream``
        :param port:  Forwarded unchanged to the helper's ``createStream``
        :param address:  Forwarded unchanged to the helper's ``createStream``
        :return: A DStream object deserialized with UTF8Deserializer
        :raises TypeError: re-raised if the helper cannot be instantiated; when the
                           message shows the JVM package is missing, an error
                           message is printed first
        """
        try:
            helper = ssc._jvm.org.apache.spark.streaming.amqp.AMQPUtilsPythonHelper()
        except TypeError as err:
            package_missing = str(err) == "'JavaPackage' object is not callable"
            if package_missing:
                AMQPUtils._printErrorMsg(ssc.sparkContext)
            raise

        java_stream = helper.createStream(ssc._jssc, host, port, address)
        text_deserializer = UTF8Deserializer()
        return DStream(java_stream, ssc, text_deserializer)
 def handle(self, data: DStream):
     """
     Apply ``self.transformer`` to the stream, then register ``self.processor``
     to run on every RDD of the transformed stream.

     Any exception raised while wiring the stream is logged and printed
     instead of being propagated.

     :param data: input dstream
     :return: None
     """
     def _transform(time, rdd):
         return self.transformer(time, rdd)

     try:
         transformed = data.transform(_transform)
         # process rdds of each batch
         transformed.foreachRDD(self.processor)
     except Exception as err:
         logging.error(err)
         print(err)
    def handle(self, data: DStream):
        """
        Apply ``self.transformer`` to the stream and hand back the result.

        If the ``transform`` call raises, the error is logged and printed and
        the input stream is returned unchanged.

        :param data: input dstream
        :return: the transformed dstream, or ``data`` itself when transforming failed
        """
        result = data
        try:
            result = data.transform(self.transformer)
        except Exception as err:
            logging.error(err)
            print(err)
        return result
Esempio n. 22
0
    def createPairedStream(ssc, brokerUrl, topics,
                           storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Subscribe to topics on an Mqtt Broker and expose the messages as a DStream.

        :param ssc:  StreamingContext object
        :param brokerUrl:  Url of remote mqtt publisher
        :param topics:  topic names to subscribe to
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
        java_stream = MQTTUtils._get_helper(ssc._sc).createStream(
            ssc._jssc, brokerUrl, topics, java_storage_level)
        text_deserializer = UTF8Deserializer()
        return DStream(java_stream, ssc, text_deserializer)
    def createStream(ssc, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        Create an input stream that pulls messages from a Event Hub.

        The stream is built by the JVM ``EventHubUtilsPythonHelper``, which is
        loaded through the current thread's context class loader.

        :param ssc:  StreamingContext object
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        helper_class_name = "com.ge.predix.predixinsights.eventhub.EventHubUtilsPythonHelper"
        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            class_loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = class_loader.loadClass(helper_class_name).newInstance()
            java_stream = helper.createStream(ssc._jssc, java_storage_level)
        except Py4JJavaError as e:
            helper_missing = 'ClassNotFoundException' in str(e.java_exception)
            if helper_missing:
                EHUtils._printErrorMsg(ssc.sparkContext)
            raise e

        text_deserializer = UTF8Deserializer()
        return DStream(java_stream, ssc, text_deserializer)
Esempio n. 24
0
    def createStream(ssc,
                     brokerUrl,
                     topic,
                     username,
                     password,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Create an input stream that pulls messages from a Mqtt Broker,
        passing a username and password to the broker helper.

        :param ssc:  StreamingContext object
        :param brokerUrl:  Url of remote mqtt publisher
        :param topic:  topic name to subscribe to
        :param username:  the virtual host name : username, or just the username
        :param password:  the password of mqtt
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
        mqtt_helper = MQTTUtils._get_helper(ssc._sc)
        # The helper takes the storage level before the credentials.
        java_stream = mqtt_helper.createStream(
            ssc._jssc, brokerUrl, topic, java_storage_level, username, password)
        text_deserializer = UTF8Deserializer()
        return DStream(java_stream, ssc, text_deserializer)
Esempio n. 25
0
    def createStreams(ssc,
                      logServiceProject,
                      logStoreName,
                      loghubConsumerGroupName,
                      loghubEndpoint,
                      numReceivers,
                      accessKeyId,
                      accessKeySecret,
                      storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        Create an input stream through the JVM ``LoghubUtilsHelper``.

        :param ssc: StreamingContext object.
        :param logServiceProject: The name of `LogService` project.
        :param logStoreName: The name of logStore.
        :param loghubConsumerGroupName: The group name of loghub consumer. All consumer
                                        processes which have the same group name will
                                        consume a specific logStore together.
        :param loghubEndpoint: The endpoint of loghub.
        :param numReceivers: The number of receivers.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        helper_class_name = "org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper"
        try:
            class_loader = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()
            helper = class_loader.loadClass(helper_class_name).newInstance()
            java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
            java_stream = helper.createStream(
                ssc._jssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                loghubEndpoint, numReceivers, accessKeyId, accessKeySecret,
                java_storage_level)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            helper_missing = 'ClassNotFoundException' in str(e.java_exception)
            if helper_missing:
                LoghubUtils._printErrorMsg()
            raise e

        text_deserializer = UTF8Deserializer()
        return DStream(java_stream, ssc, text_deserializer)
Esempio n. 26
0
    def createStream(ssc,
                     brokerUrl,
                     topic,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Subscribe to one topic on an Mqtt Broker and expose the messages as a DStream.

        :param ssc:  StreamingContext object
        :param brokerUrl:  Url of remote mqtt publisher
        :param topic:  topic name to subscribe to
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        :raises TypeError: re-raised if the JVM helper cannot be instantiated; when
                           the message shows the JVM package is missing, an error
                           message is printed first
        """
        try:
            helper = ssc._jvm.org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper()
        except TypeError as err:
            package_missing = str(err) == "'JavaPackage' object is not callable"
            if package_missing:
                MQTTUtils._printErrorMsg(ssc.sparkContext)
            raise

        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
        java_stream = helper.createStream(
            ssc._jssc, brokerUrl, topic, java_storage_level)
        text_deserializer = UTF8Deserializer()
        return DStream(java_stream, ssc, text_deserializer)
Esempio n. 27
0
    def _toPythonDStream(ssc, jstream, bodyDecoder):
        """
        Wrap a JVM stream in a Python DStream of raw key/value pairs.

        :param ssc:  StreamingContext object
        :param jstream:  JVM stream to wrap
        :param bodyDecoder:  accepted but not used by this implementation
        :return: A DStream using a PairDeserializer of two NoOpSerializers
        """
        raw_pair_serializer = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        return DStream(jstream, ssc, raw_pair_serializer)
Esempio n. 28
0
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", appName="jms py")
ssc = StreamingContext(sc, 5)

# Instantiate the JVM-side JMS helper through the context class loader.
helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader(
).loadClass("com.redhat.spark.streaming.jms.JMSUtilsPythonHelper")
helper = helperClass.newInstance()

jbrokerURL = "amqp://127.0.0.1:5672"
jqueuename = "default"
jlevel = ssc._sc._getJavaStorageLevel(StorageLevel.MEMORY_AND_DISK_SER_2)
jstream = helper.createStream(ssc._jssc, jbrokerURL, jqueuename, jlevel)

# Keys and values are left as raw bytes by the deserializer and decoded below.
ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
stream = DStream(jstream, ssc, ser)


def utf8_decoder(s):
    """Decode ``s`` as UTF-8; a falsy value (e.g. ``None``) is returned as is."""
    return s and s.decode('utf-8')


keyDecoder = utf8_decoder
valueDecoder = utf8_decoder
# Index the pair explicitly: tuple-unpacking lambda parameters were removed in
# Python 3 (PEP 3113); this form runs on both Python 2 and Python 3.
a = stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))


def process(rdd):
    """Print the number of records in ``rdd``."""
    # Call form of print works on both Python 2 and Python 3.
    print(rdd.count())


def protect(func):
    """
    Wrap ``func`` so it only runs for RDDs that contain at least one record.

    :param func: callable taking a single RDD
    :return: a callable taking an RDD; it calls ``func(rdd)`` when
             ``rdd.take(1)`` is non-empty and does nothing otherwise
    """
    def _protect(rdd):
        if rdd.take(1):
            func(rdd)
    # Without this return the wrapper was discarded and protect() yielded None.
    return _protect
Esempio n. 29
0
    def createStream(
        ssc: StreamingContext,
        kinesisAppName: str,
        streamName: str,
        endpointUrl: str,
        regionName: str,
        initialPositionInStream: str,
        checkpointInterval: int,
        metricsLevel: int = MetricsLevel.DETAILED,
        storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2,
        awsAccessKeyId: Optional[str] = None,
        awsSecretKey: Optional[str] = None,
        decoder: Union[
            Callable[[Optional[bytes]], T], Callable[[Optional[bytes]], Optional[str]]
        ] = utf8_decoder,
        stsAssumeRoleArn: Optional[str] = None,
        stsSessionName: Optional[str] = None,
        stsExternalId: Optional[str] = None,
    ) -> Union["DStream[Union[T, Optional[str]]]", "DStream[T]"]:
        """
        Create an input stream that pulls messages from a Kinesis stream, using
        the Kinesis Client Library (KCL) to do the pulling.

        Parameters
        ----------
        ssc : :class:`StreamingContext`
            StreamingContext object
        kinesisAppName : str
            Kinesis application name used by the Kinesis Client Library (KCL) to
            update DynamoDB
        streamName : str
            Kinesis stream name
        endpointUrl : str
            Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
        regionName : str
            Name of region used by the Kinesis Client Library (KCL) to update
            DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
        initialPositionInStream : int
            The worker's initial starting position in the stream when there is
            no Kinesis checkpoint info: either the beginning of the stream per
            Kinesis' limit of 24 hours (InitialPositionInStream.TRIM_HORIZON)
            or the tip of the stream (InitialPositionInStream.LATEST).
            (The signature annotates this parameter as ``str``.)
        checkpointInterval : int
            Checkpoint interval(in seconds) for Kinesis checkpointing. See the Kinesis
            Spark Streaming documentation for more details on the different
            types of checkpoints.
        metricsLevel : int
            Level of CloudWatch PutMetrics.
            Can be set to either DETAILED, SUMMARY, or NONE. (default is DETAILED)
        storageLevel : :class:`pyspark.StorageLevel`, optional
            Storage level to use for storing the received objects (default is
            StorageLevel.MEMORY_AND_DISK_2)
        awsAccessKeyId : str, optional
            AWS AccessKeyId (default is None. If None, will use
            DefaultAWSCredentialsProviderChain)
        awsSecretKey : str, optional
            AWS SecretKey (default is None. If None, will use
            DefaultAWSCredentialsProviderChain)
        decoder : function, optional
            A function used to decode value (default is utf8_decoder)
        stsAssumeRoleArn : str, optional
            ARN of IAM role to assume when using STS sessions to read from
            the Kinesis stream (default is None).
        stsSessionName : str, optional
            Name to uniquely identify STS sessions used to read from Kinesis
            stream, if STS is being used (default is None).
        stsExternalId : str, optional
            External ID that can be used to validate against the assumed IAM
            role's trust policy, if STS is being used (default is None).

        Returns
        -------
        A DStream object

        Notes
        -----
        The given AWS credentials will get saved in DStream checkpoints if checkpointing
        is enabled. Make sure that your checkpoint directory is secure.
        """
        java_storage_level = ssc._sc._getJavaStorageLevel(storageLevel)
        java_checkpoint_interval = ssc._jduration(checkpointInterval)

        jvm = ssc._jvm
        assert jvm is not None

        try:
            helper = jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper()
        except TypeError as err:
            package_missing = str(err) == "'JavaPackage' object is not callable"
            if package_missing:
                _print_missing_jar(
                    "Streaming's Kinesis",
                    "streaming-kinesis-asl",
                    "streaming-kinesis-asl-assembly",
                    ssc.sparkContext.version,
                )
            raise

        # Positional order expected by the JVM helper's createStream.
        helper_args = (
            ssc._jssc,
            kinesisAppName,
            streamName,
            endpointUrl,
            regionName,
            initialPositionInStream,
            java_checkpoint_interval,
            metricsLevel,
            java_storage_level,
            awsAccessKeyId,
            awsSecretKey,
            stsAssumeRoleArn,
            stsSessionName,
            stsExternalId,
        )
        java_stream = helper.createStream(*helper_args)
        raw_stream: DStream = DStream(java_stream, ssc, NoOpSerializer())
        return raw_stream.map(lambda record: decoder(record))
Esempio n. 30
0
 def __init__(self, jdstream, ssc, jrdd_deserializer):
     """
     Emit a DeprecationWarning about Kafka 0.8 support, then initialise the
     instance through ``DStream.__init__`` with the same three arguments.
     """
     deprecation_message = (
         "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
         "See SPARK-21893."
     )
     warnings.warn(deprecation_message, DeprecationWarning)
     DStream.__init__(self, jdstream, ssc, jrdd_deserializer)
Esempio n. 31
0
        method.routing_key,
        body,
    )

    dadosClassificados = classificar(body)

    mensagem = dadosClassificados[0] + "|" + dadosClassificados[
        1] + "|" + dadosClassificados[2]
    connection = pika.BlockingConnection(
        pika.ConnectionParameters('192.168.25.57', 5672, '/starwars',
                                  credentials))
    channel = connection.channel()
    channel.exchange_declare(exchange='topic_logs', type='topic')
    channel.basic_publish(exchange='topic_logs',
                          routing_key=method.routing_key,
                          body=mensagem)
    channel.basic_publish(exchange='topic_logs',
                          routing_key="TODOS",
                          body=mensagem)
    print " [x] Sent %r:%r" % (method.routing_key, mensagem)
    connection.close()


# Register `callback` as the consumer for `queue_name`, with no_ack=True.
channel.basic_consume(callback, queue=queue_name, no_ack=True)

# Connect to the Spark master and create a streaming context (second argument: 1).
sc = SparkContext("spark://172.16.207.155:8088", "consumer")
ssc = StreamingContext(sc, 1)
# NOTE(review): DStream receives only the return value of
# channel.start_consuming() here. Confirm that a single-argument construction
# is valid for DStream, and whether start_consuming() returns at all before
# ssc.start() is reached (presumably it blocks while consuming -- verify).
CSR = DStream(channel.start_consuming())
ssc.start()
ssc.awaitTermination()
Esempio n. 32
0
 def __init__(self, jdstream, ssc, jrdd_deserializer):
     """Initialise the instance by passing all three arguments to ``DStream.__init__``."""
     DStream.__init__(self, jdstream, ssc, jrdd_deserializer)
Esempio n. 33
0
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", appName="jms py")
ssc = StreamingContext(sc, 5)

# Instantiate the JVM-side JMS helper through the context class loader.
helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader().loadClass(
    "com.redhat.spark.streaming.jms.JMSUtilsPythonHelper")
helper = helperClass.newInstance()

jbrokerURL = "amqp://127.0.0.1:5672"
jqueuename = "default"
jlevel = ssc._sc._getJavaStorageLevel(StorageLevel.MEMORY_AND_DISK_SER_2)
jstream = helper.createStream(ssc._jssc, jbrokerURL, jqueuename, jlevel)

# Keys and values are left as raw bytes by the deserializer and decoded below.
ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
stream = DStream(jstream, ssc, ser)


def utf8_decoder(s):
    """Decode ``s`` as UTF-8; a falsy value (e.g. ``None``) is returned as is."""
    return s and s.decode('utf-8')


keyDecoder = utf8_decoder
valueDecoder = utf8_decoder
# Index the pair explicitly: tuple-unpacking lambda parameters were removed in
# Python 3 (PEP 3113); this form runs on both Python 2 and Python 3.
a = stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))

def process(rdd):
    """Print the number of records in ``rdd``."""
    # Call form of print works on both Python 2 and Python 3.
    print(rdd.count())

def protect(func):
    """
    Wrap ``func`` so it only runs for RDDs that contain at least one record.

    :param func: callable taking a single RDD
    :return: a callable taking an RDD; it calls ``func(rdd)`` when
             ``rdd.take(1)`` is non-empty and does nothing otherwise
    """
    def _protect(rdd):
        first_records = rdd.take(1)
        if not first_records:
            return
        func(rdd)

    return _protect

# Run `process` on every RDD of the decoded stream, skipping RDDs for which
# rdd.take(1) comes back empty.
a.foreachRDD(protect(process))
Esempio n. 34
0
        def run_spark_job(queue: Queue,
                          _agg_function: AggregationFunction,
                          _agg_window_millis: int,
                          _spark_opts: dict = {},
                          _environment: dict = {}):
            """
            Run a Spark Streaming job that aggregates Pub/Sub messages per
            application and time window and puts each aggregated row on
            ``queue`` as a JSON string.

            ``self``, ``project_id``, ``subscription``, ``agg_window_millis``
            and ``BASE_PATH`` are not parameters; they are resolved from the
            enclosing scope.

            :param queue: receives ``InputMessage(...).to_json()`` for every
                          aggregated row
            :param _agg_function: selects the aggregation expression (AVG, SUM,
                                  COUNT, P50, P75, P95, P99); any other value
                                  keeps the plain ``value`` expression
            :param _agg_window_millis: NOTE(review): never read in this body --
                                       the batch duration uses the enclosing
                                       ``agg_window_millis`` and the window
                                       uses ``self.agg_window_millis``; confirm
                                       this is intended
            :param _spark_opts: every entry is passed to the session builder's
                                ``config``; if a ``"timeout"`` key is present
                                its value also bounds the wait for termination
            :param _environment: merged into ``os.environ`` before Spark starts
            """
            os.environ.update(_environment)
            try:
                # findspark is optional: a failure here is logged and ignored.
                try:
                    import findspark
                    findspark.init()
                except Exception as ex:
                    self.logger.warn("Cannot import Spark pyspark with"
                                     " findspark. Message: {}".format(str(ex)))
                    pass

                # Imported here, after the optional findspark.init() above.
                from pyspark.sql import SparkSession
                from pyspark.streaming import StreamingContext
                from pyspark.sql.functions import expr, window
                from pyspark.serializers import NoOpSerializer
                from pyspark.streaming import DStream
                from pyspark.streaming.kafka import utf8_decoder

                spark_builder = SparkSession \
                    .builder \

                # Apply the caller-supplied options first ...
                for k in _spark_opts:
                    spark_builder = spark_builder.config(k, _spark_opts[k])

                # ... then the fixed app name and jar settings.
                # NOTE(review): the result of this chain is not assigned; it
                # only takes effect if the builder methods modify
                # spark_builder in place -- confirm.
                spark_builder \
                    .appName(str(self)) \
                    .config("spark.jars.packages",
                            "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                            "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
                    .config("spark.jars",
                            BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

                spark = spark_builder.getOrCreate()
                spark.sparkContext.setLogLevel("WARN")
                # Batch duration is derived from the enclosing scope's
                # agg_window_millis, not from the _agg_window_millis parameter.
                ssc = StreamingContext(spark.sparkContext,
                                       (agg_window_millis / 1000))

                # Default is the raw value; replaced below for known functions.
                agg = expr("value")
                if _agg_function == AggregationFunction.AVG:
                    agg = expr("avg(value)")
                elif _agg_function == AggregationFunction.SUM:
                    agg = expr("sum(value)")
                elif _agg_function == AggregationFunction.COUNT:
                    agg = expr("count(value)")
                elif _agg_function == AggregationFunction.P50:
                    agg = expr("percentile(value, 0.5)")
                elif _agg_function == AggregationFunction.P75:
                    agg = expr("percentile(value, 0.75)")
                elif _agg_function == AggregationFunction.P95:
                    agg = expr("percentile(value, 0.95)")
                elif _agg_function == AggregationFunction.P99:
                    agg = expr("percentile(value, 0.99)")

                # JVM-side objects reached through the py4j gateway.
                deserializer = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.SparkPubsubMessageSerializer()  # noqa: E501
                pubsub_utils = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.PubsubUtils
                credentials = \
                    ssc._jvm.org.apache.spark.streaming.pubsub.SparkGCPCredentials
                storage_level = \
                    ssc._jvm.org.apache.spark.storage.StorageLevel

                _pubsub_stream = pubsub_utils \
                    .createStream(ssc._jssc,
                                  project_id,
                                  subscription,
                                  credentials.Builder().build(),
                                  storage_level.DISK_ONLY())
                # Transform the JVM stream with the JVM serializer, then wrap it
                # as a Python DStream whose records are passed to utf8_decoder.
                _pubsub_stream_des = _pubsub_stream.transform(deserializer)
                ser = NoOpSerializer()
                pubsub_stream = DStream(_pubsub_stream_des, ssc,
                                        ser).map(utf8_decoder)

                def aggregate_rdd(_queue, _agg, df, ts):
                    """
                    Group ``df`` by application and time window, apply
                    ``_agg``, and enqueue one JSON message per resulting row.
                    An empty ``df`` only produces a warning.
                    """

                    secs = int(self.agg_window_millis / 1000)
                    win = window("ts", "{}  seconds".format(secs))
                    if df.first():
                        aggs = df \
                            .groupBy("application", win) \
                            .agg(_agg.alias("value")) \
                            .collect()

                        for row in aggs:
                            message = InputMessage(row["application"],
                                                   value=row["value"],
                                                   ts=ts)
                            self.logger.debug("Enqueue: {}".format(
                                message.to_json()))
                            # An AssertionError from put() is logged and the
                            # message is dropped.
                            try:
                                _queue.put(message.to_json())
                            except AssertionError as ex:
                                self.logger.warn(str(ex))
                    else:
                        self.logger.warn("Empty RDD")

                # Register the per-batch aggregation on the Pub/Sub stream:
                # each RDD is read as JSON into a DataFrame and aggregated.
                pubsub_stream \
                    .foreachRDD(lambda ts, rdd:
                                aggregate_rdd(queue, agg,
                                              spark.read.json(rdd), ts))

                # Run
                ssc.start()
                # With a "timeout" option the wait is bounded; otherwise wait
                # until termination. Both paths then stop the streaming context
                # and the session.
                if "timeout" in _spark_opts:
                    ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
                    ssc.stop()
                    spark.stop()
                else:
                    ssc.awaitTermination()
                    ssc.stop()
                    spark.stop()

            # NOTE(review): this handler only re-raises, so the outer try adds
            # no handling of its own.
            except Exception as e:
                raise e
Esempio n. 35
0
    def createDirectStream(ssc, topics, kafkaParams, fromOffsets=None,
                           keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
                           messageHandler=None):
        """
        .. note:: Experimental

        Create an input stream that directly pulls messages from a Kafka Broker and specific offset.

        This is not a receiver based Kafka input stream, it directly pulls the message from Kafka
        in each batch duration and processed without storing.

        This does not use Zookeeper to store offsets. The consumed offsets are tracked
        by the stream itself. For interoperability with Kafka monitoring tools that depend on
        Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
        You can access the offsets used in each batch from the generated RDDs.

        To recover from driver failures, you have to enable checkpointing in the StreamingContext.
        The information on consumed offset can be recovered from the checkpoint.
        See the programming guide for details (constraints, etc.).

        :param ssc:  StreamingContext object.
        :param topics:  list of topic_name to consume.
        :param kafkaParams: Additional params for Kafka.
        :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
                            point of the stream.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder).
        :param valueDecoder:  A function used to decode value (default is utf8_decoder).
        :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can access
                               meta using messageHandler (default is None).
        :return: A DStream object
        :raises TypeError: if ``topics`` is not a list or ``kafkaParams`` is not a dict
        """
        if not isinstance(topics, list):
            raise TypeError("topics should be list")
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        offsets = {} if fromOffsets is None else fromOffsets

        helper = KafkaUtils._get_helper(ssc._sc)
        java_offsets = {
            topic_partition._jTopicAndPartition(helper): offset
            for topic_partition, offset in offsets.items()
        }

        if messageHandler is None:
            # Plain (key, value) pairs: decode both sides in Python.
            def convert(k_v):
                return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

            deserializer = PairDeserializer(NoOpSerializer(), NoOpSerializer())
            java_stream = helper.createDirectStreamWithoutMessageHandler(
                ssc._jssc, kafkaParams, set(topics), java_offsets)
        else:
            # Message objects: attach the decoders, then let the handler convert.
            def convert(m):
                m._set_key_decoder(keyDecoder)
                m._set_value_decoder(valueDecoder)
                return messageHandler(m)

            deserializer = AutoBatchedSerializer(PickleSerializer())
            java_stream = helper.createDirectStreamWithMessageHandler(
                ssc._jssc, kafkaParams, set(topics), java_offsets)

        mapped = DStream(java_stream, ssc, deserializer).map(convert)
        return KafkaDStream(mapped._jdstream, ssc, mapped._jrdd_deserializer)
Esempio n. 36
0
class KafkaUtils(object):
    """Helpers for creating Kafka-backed input DStreams."""

    @staticmethod
    def createStream(ssc,
                     zkQuorum,
                     groupId,
                     topics,
                     kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder,
                     valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka (default is None, meaning
                            no additional params). The passed dict is not modified.
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        :raises TypeError: if topics is not a dict
        """
        # Validate before doing any other work so a bad call has no side effects.
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")

        # Work on a private copy: a shared mutable default (or the caller's own
        # dict) must not accumulate the connection settings added below.
        kafkaParams = dict(kafkaParams) if kafkaParams is not None else {}
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })

        gateway_client = ssc.sparkContext._gateway._gateway_client
        jtopics = MapConverter().convert(topics, gateway_client)
        jparam = MapConverter().convert(kafkaParams, gateway_client)
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027)
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, jparam, jtopics, jlevel)
        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                # Single parenthesised argument: prints identically whether
                # print is a statement (Python 2) or a function (Python 3).
                print("""
________________________________________________________________________________________________

  Spark Streaming's Kafka libraries not found in class path. Try one of the following.

  1. Include the Kafka library and its dependencies with in the
     spark-submit command as

     $ bin/spark-submit --packages org.apache.spark:spark-streaming-kafka:%s ...

  2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
     Group Id = org.apache.spark, Artifact Id = spark-streaming-kafka-assembly, Version = %s.
     Then, include the jar in the spark-submit command as

     $ bin/spark-submit --jars <spark-streaming-kafka-assembly.jar> ...

________________________________________________________________________________________________

""" % (ssc.sparkContext.version, ssc.sparkContext.version))
            # Bare raise re-raises the active exception with its original traceback.
            raise

        # Keys and values arrive as raw bytes; decoding happens in the map below.
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        # Index into the pair instead of tuple-parameter unpacking, which was
        # removed in Python 3.
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))