Example #1
    def createStream(ssc, zkQuorum, groupId, topics, kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     keyDecoder=utf8_decoder, valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        """
        if kafkaParams is None:
            kafkaParams = dict()
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        helper = KafkaUtils._get_helper(ssc._sc)
        jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel)
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
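A minimal usage sketch for the receiver-based createStream above, assuming the spark-streaming-kafka assembly jar is on the classpath; the Zookeeper address, group id, and topic name below are illustrative placeholders:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="KafkaStreamExample")
ssc = StreamingContext(sc, 2)  # 2-second batch interval

# One receiver thread for the topic "events"; keys and values arrive UTF-8 decoded.
stream = KafkaUtils.createStream(ssc, "localhost:2181", "example-group", {"events": 1})
stream.map(lambda kv: kv[1]).pprint()

ssc.start()
ssc.awaitTermination()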
Example #2
class KafkaUtils(object):
    @staticmethod
    def createStream(ssc,
                     zkQuorum,
                     groupId,
                     topics,
                     kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder,
                     valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        """
        java_import(ssc._jvm, "org.apache.spark.streaming.kafka.KafkaUtils")

        if kafkaParams is None:
            kafkaParams = {}
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")
        jtopics = MapConverter().convert(
            topics, ssc.sparkContext._gateway._gateway_client)
        jparam = MapConverter().convert(
            kafkaParams, ssc.sparkContext._gateway._gateway_client)
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        def getClassByName(name):
            return ssc._jvm.org.apache.spark.util.Utils.classForName(name)

        try:
            array = getClassByName("[B")
            decoder = getClassByName("kafka.serializer.DefaultDecoder")
            jstream = ssc._jvm.KafkaUtils.createStream(ssc._jssc, array, array,
                                                       decoder, decoder,
                                                       jparam, jtopics, jlevel)
        except Py4JError as e:
            # TODO: use --jars once it also works on the driver
            if not str(e) or 'call a package' in str(e):
                print("No kafka package, please put the assembly jar into classpath:")
                print(" $ bin/spark-submit --driver-class-path external/kafka-assembly/target/"
                      "scala-*/spark-streaming-kafka-assembly-*.jar")
            raise e
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Example #3
    def createDirectStream(ssc,
                           topics,
                           kafkaParams,
                           fromOffsets=None,
                           keyDecoder=utf8_decoder,
                           valueDecoder=utf8_decoder):
        """
        .. note:: Experimental

        Create an input stream that directly pulls messages from a Kafka broker at specific offsets.

        This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
        in each batch duration and processes them without storing them.

        This does not use Zookeeper to store offsets. The consumed offsets are tracked
        by the stream itself. For interoperability with Kafka monitoring tools that depend on
        Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
        You can access the offsets used in each batch from the generated RDDs.

        To recover from driver failures, you have to enable checkpointing in the StreamingContext.
        The information on consumed offset can be recovered from the checkpoint.
        See the programming guide for details (constraints, etc.).

        :param ssc:  StreamingContext object.
        :param topics:  list of topic_name to consume.
        :param kafkaParams: Additional params for Kafka.
        :param fromOffsets: Per-topic/partition Kafka offsets defining the (inclusive) starting
                            point of the stream.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder).
        :param valueDecoder:  A function used to decode value (default is utf8_decoder).
        :return: A DStream object
        """
        if fromOffsets is None:
            fromOffsets = dict()
        if not isinstance(topics, list):
            raise TypeError("topics should be list")
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")

        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()

            jfromOffsets = dict([(k._jTopicAndPartition(helper), v)
                                 for (k, v) in fromOffsets.items()])
            jstream = helper.createDirectStream(ssc._jssc, kafkaParams,
                                                set(topics), jfromOffsets)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(ssc.sparkContext)
            raise e

        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser) \
            .map(lambda k_v: (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
        return KafkaDStream(stream._jdstream, ssc, stream._jrdd_deserializer)
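For reference, a hedged sketch of calling createDirectStream with a pinned starting offset, reusing the StreamingContext ssc from the sketch after Example #1; the broker address, topic, and offset value are assumptions:

from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition

kafkaParams = {"metadata.broker.list": "localhost:9092"}
# Start partition 0 of "events" at offset 0; leave fromOffsets out to let
# auto.offset.reset decide the starting point instead.
fromOffsets = {TopicAndPartition("events", 0): 0}
directStream = KafkaUtils.createDirectStream(ssc, ["events"], kafkaParams, fromOffsets)
directStream.map(lambda kv: kv[1]).pprint()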
Example #4
    def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
                  keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
                  messageHandler=None):
        """
        Create an RDD from Kafka using offset ranges for each topic and partition.

        :param sc:  SparkContext object
        :param kafkaParams: Additional params for Kafka
        :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
        :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
            map, in which case leaders will be looked up on the driver.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can access
                               the metadata using messageHandler (default is None).
        :return: An RDD object

        .. note:: Experimental
        .. note:: Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0.
            See SPARK-21893.
        """
        warnings.warn(
            "Deprecated in 2.3.0. Kafka 0.8 support is deprecated as of Spark 2.3.0. "
            "See SPARK-21893.",
            DeprecationWarning)
        if leaders is None:
            leaders = dict()
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        if not isinstance(offsetRanges, list):
            raise TypeError("offsetRanges should be list")

        def funcWithoutMessageHandler(k_v):
            return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

        def funcWithMessageHandler(m):
            m._set_key_decoder(keyDecoder)
            m._set_value_decoder(valueDecoder)
            return messageHandler(m)

        helper = KafkaUtils._get_helper(sc)

        joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
        jleaders = dict([(k._jTopicAndPartition(helper),
                          v._jBroker(helper)) for (k, v) in leaders.items()])
        if messageHandler is None:
            jrdd = helper.createRDDWithoutMessageHandler(
                sc._jsc, kafkaParams, joffsetRanges, jleaders)
            ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
            rdd = RDD(jrdd, sc, ser).map(funcWithoutMessageHandler)
        else:
            jrdd = helper.createRDDWithMessageHandler(
                sc._jsc, kafkaParams, joffsetRanges, jleaders)
            rdd = RDD(jrdd, sc).map(funcWithMessageHandler)

        return KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)
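A small sketch of how this createRDD variant could read a bounded slice of a topic into a batch RDD; it assumes an existing SparkContext sc, and the broker address and offset range are placeholders:

from pyspark.streaming.kafka import KafkaUtils, OffsetRange

# Read messages 0..99 of partition 0 of the topic "events".
offsetRanges = [OffsetRange("events", 0, fromOffset=0, untilOffset=100)]
rdd = KafkaUtils.createRDD(sc, {"metadata.broker.list": "localhost:9092"}, offsetRanges)
print(rdd.count())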
Example #5
    def createRDD(sc, kafkaParams, offsetRanges, leaders=None,
                  keyDecoder=utf8_decoder, valueDecoder=utf8_decoder,
                  messageHandler=None):
        """
        .. note:: Experimental

        Create an RDD from Kafka using offset ranges for each topic and partition.

        :param sc:  SparkContext object
        :param kafkaParams: Additional params for Kafka
        :param offsetRanges:  list of offsetRange to specify topic:partition:[start, end) to consume
        :param leaders: Kafka brokers for each TopicAndPartition in offsetRanges.  May be an empty
            map, in which case leaders will be looked up on the driver.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :param messageHandler: A function used to convert KafkaMessageAndMetadata. You can access
                               the metadata using messageHandler (default is None).
        :return: An RDD object
        """
        if leaders is None:
            leaders = dict()
        if not isinstance(kafkaParams, dict):
            raise TypeError("kafkaParams should be dict")
        if not isinstance(offsetRanges, list):
            raise TypeError("offsetRanges should be list")

        def funcWithoutMessageHandler(k_v):
            return (keyDecoder(k_v[0]), valueDecoder(k_v[1]))

        def funcWithMessageHandler(m):
            m._set_key_decoder(keyDecoder)
            m._set_value_decoder(valueDecoder)
            return messageHandler(m)

        try:
            helperClass = sc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            joffsetRanges = [o._jOffsetRange(helper) for o in offsetRanges]
            jleaders = dict([(k._jTopicAndPartition(helper),
                              v._jBroker(helper)) for (k, v) in leaders.items()])
            if messageHandler is None:
                jrdd = helper.createRDDWithoutMessageHandler(
                    sc._jsc, kafkaParams, joffsetRanges, jleaders)
                ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
                rdd = RDD(jrdd, sc, ser).map(funcWithoutMessageHandler)
            else:
                jrdd = helper.createRDDWithMessageHandler(
                    sc._jsc, kafkaParams, joffsetRanges, jleaders)
                rdd = RDD(jrdd, sc).map(funcWithMessageHandler)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(sc)
            raise e

        return KafkaRDD(rdd._jrdd, sc, rdd._jrdd_deserializer)
Example #6
 def test_hash_serializer(self):
     hash(NoOpSerializer())
     hash(UTF8Deserializer())
     hash(CPickleSerializer())
     hash(MarshalSerializer())
     hash(AutoSerializer())
     hash(BatchedSerializer(CPickleSerializer()))
     hash(AutoBatchedSerializer(MarshalSerializer()))
     hash(PairDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CartesianDeserializer(NoOpSerializer(), UTF8Deserializer()))
     hash(CompressedSerializer(CPickleSerializer()))
     hash(FlattenedValuesSerializer(CPickleSerializer()))
Example #7
    def createStream(ssc,
                     zkQuorum,
                     groupId,
                     topics,
                     kafkaParams=None,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2,
                     keyDecoder=utf8_decoder,
                     valueDecoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kafka Broker.

        :param ssc:  StreamingContext object
        :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
        :param groupId:  The group id for this consumer.
        :param topics:  Dict of (topic_name -> numPartitions) to consume.
                        Each partition is consumed in its own thread.
        :param kafkaParams: Additional params for Kafka
        :param storageLevel:  RDD storage level.
        :param keyDecoder:  A function used to decode key (default is utf8_decoder)
        :param valueDecoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        """
        if kafkaParams is None:
            kafkaParams = dict()
        kafkaParams.update({
            "zookeeper.connect": zkQuorum,
            "group.id": groupId,
            "zookeeper.connection.timeout.ms": "10000",
        })
        if not isinstance(topics, dict):
            raise TypeError("topics should be dict")
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            # Use KafkaUtilsPythonHelper to access Scala's KafkaUtils (see SPARK-6027)
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
                .loadClass("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, kafkaParams, topics,
                                          jlevel)
        except Py4JJavaError as e:
            # TODO: use --jars once it also works on the driver
            if 'ClassNotFoundException' in str(e.java_exception):
                KafkaUtils._printErrorMsg(ssc.sparkContext)
            raise e
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)
        return stream.map(lambda k_v:
                          (keyDecoder(k_v[0]), valueDecoder(k_v[1])))
Example #8
    def _toPythonDStream(ssc, jstream, bodyDecoder):
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        stream = DStream(jstream, ssc, ser)

        def func(event):
            headersBytes = BytesIO(event[0]) if sys.version >= "3" else StringIO(event[0])
            headers = {}
            strSer = UTF8Deserializer()
            for i in range(0, read_int(headersBytes)):
                key = strSer.loads(headersBytes)
                value = strSer.loads(headersBytes)
                headers[key] = value
            body = bodyDecoder(event[1])
            return (headers, body)
        return stream.map(func)
Example #9
 def _jdstream(self):
     if self._jdstream_val:
         return self._jdstream_val
     if self._bypass_serializer:
         self._jrdd_deserializer = NoOpSerializer()
     command = (self.func, self._prev_jrdd_deserializer,
                self._jrdd_deserializer)
     # the serialized command will be compressed by broadcast
     ser = CloudPickleSerializer()
     pickled_command = ser.dumps(command)
     if len(pickled_command) > (1 << 20):  # 1M
         broadcast = self.ctx.broadcast(pickled_command)
         pickled_command = ser.dumps(broadcast)
     broadcast_vars = ListConverter().convert(
         [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
         self.ctx._gateway._gateway_client)
     self.ctx._pickled_broadcast_vars.clear()
     class_tag = self._prev_jdstream.classTag()
     env = MapConverter().convert(self.ctx.environment,
                                  self.ctx._gateway._gateway_client)
     includes = ListConverter().convert(self.ctx._python_includes,
                                        self.ctx._gateway._gateway_client)
     python_dstream = self.ctx._jvm.PythonDStream(
         self._prev_jdstream.dstream(), bytearray(pickled_command), env,
         includes, self.preservesPartitioning, self.ctx.pythonExec,
         broadcast_vars, self.ctx._javaAccumulator, class_tag)
     self._jdstream_val = python_dstream.asJavaDStream()
     return self._jdstream_val
Example #10
    def poissonVectorRDD(sc,
                         mean,
                         numRows,
                         numCols,
                         numPartitions=None,
                         seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the Poisson distribution with the input mean.

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI() \
            .poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed)
        poisson = RDD(jrdd, sc, NoOpSerializer())
        return poisson.map(
            lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #11
    def uniformRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the
        uniform distribution U(0.0, 1.0).

        To transform the distribution in the generated RDD from U(0.0, 1.0)
        to U(a, b), use
        C{RandomRDDs.uniformRDD(sc, n, p, seed)\
          .map(lambda v: a + (b - a) * v)}

        >>> x = RandomRDDs.uniformRDD(sc, 100).collect()
        >>> len(x)
        100
        >>> max(x) <= 1.0 and min(x) >= 0.0
        True
        >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()
        4
        >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()
        >>> parts == sc.defaultParallelism
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().uniformRDD(sc._jsc, size,
                                                   numPartitions, seed)
        uniform = RDD(jrdd, sc, NoOpSerializer())
        return uniform.map(lambda bytes: _deserialize_double(bytearray(bytes)))
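As the docstring notes, the U(0.0, 1.0) samples can be shifted to U(a, b) with a plain map; a short sketch assuming an existing SparkContext sc (the bounds and seed are arbitrary):

from pyspark.mllib.random import RandomRDDs

a, b = -5.0, 5.0
scaled = RandomRDDs.uniformRDD(sc, 1000, seed=42).map(lambda v: a + (b - a) * v)
samples = scaled.collect()
assert all(a <= v <= b for v in samples)  # every sample now lies in [a, b]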
Example #12
    def createStream(ssc,
                     kinesisAppName,
                     streamName,
                     endpointUrl,
                     regionName,
                     initialPositionInStream,
                     checkpointInterval,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     awsAccessKeyId=None,
                     awsSecretKey=None,
                     decoder=utf8_decoder):
        """
        Create an input stream that pulls messages from a Kinesis stream. This uses the
        Kinesis Client Library (KCL) to pull messages from Kinesis.

        Note: The given AWS credentials will get saved in DStream checkpoints if checkpointing is
        enabled. Make sure that your checkpoint directory is secure.

        :param ssc:  StreamingContext object
        :param kinesisAppName:  Kinesis application name used by the Kinesis Client Library (KCL) to
                                update DynamoDB
        :param streamName:  Kinesis stream name
        :param endpointUrl:  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
        :param regionName:  Name of region used by the Kinesis Client Library (KCL) to update
                            DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
        :param initialPositionInStream:  In the absence of Kinesis checkpoint info, this is the
                                         worker's initial starting position in the stream. The
                                         values are either the beginning of the stream per Kinesis'
                                         limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) or
                                         the tip of the stream (InitialPositionInStream.LATEST).
        :param checkpointInterval:  Checkpoint interval for Kinesis checkpointing. See the Kinesis
                                    Spark Streaming documentation for more details on the different
                                    types of checkpoints.
        :param storageLevel:  Storage level to use for storing the received objects (default is
                              StorageLevel.MEMORY_AND_DISK_2)
        :param awsAccessKeyId:  AWS AccessKeyId (default is None. If None, will use
                                DefaultAWSCredentialsProviderChain)
        :param awsSecretKey:  AWS SecretKey (default is None. If None, will use
                              DefaultAWSCredentialsProviderChain)
        :param decoder:  A function used to decode value (default is utf8_decoder)
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jduration = ssc._jduration(checkpointInterval)

        try:
            # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader()\
                .loadClass("org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, kinesisAppName,
                                          streamName, endpointUrl, regionName,
                                          initialPositionInStream, jduration,
                                          jlevel, awsAccessKeyId, awsSecretKey)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                KinesisUtils._printErrorMsg(ssc.sparkContext)
            raise e
        stream = DStream(jstream, ssc, NoOpSerializer())
        return stream.map(lambda v: decoder(v))
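A hedged usage sketch for the Kinesis createStream above, assuming the spark-streaming-kinesis-asl assembly is on the classpath and that ssc is an existing StreamingContext; the application name, stream name, endpoint, and region are placeholders:

from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

kinesisStream = KinesisUtils.createStream(
    ssc, "myKinesisApp", "myStream",
    "https://kinesis.us-east-1.amazonaws.com", "us-east-1",
    InitialPositionInStream.LATEST, 2)  # checkpoint roughly every 2 seconds
kinesisStream.pprint()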
Example #13
    def loadLabeledPoints(sc, path, minPartitions=None):
        """
        Load labeled points saved using RDD.saveAsTextFile.

        @param sc: Spark context
        @param path: file or directory path in any Hadoop-supported file
                     system URI
        @param minPartitions: min number of partitions
        @return: labeled data stored as an RDD of LabeledPoint

        >>> from tempfile import NamedTemporaryFile
        >>> from pyspark.mllib.util import MLUtils
        >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])), \
                        LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
        >>> tempFile = NamedTemporaryFile(delete=True)
        >>> tempFile.close()
        >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
        >>> loaded = MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
        >>> type(loaded[0]) == LabeledPoint
        True
        >>> print(examples[0])
        (1.1,(3,[0,2],[-1.23,4.56e-07]))
        >>> type(examples[1]) == LabeledPoint
        True
        >>> print(examples[1])
        (0.0,[1.01,2.02,3.03])
        """
        minPartitions = minPartitions or min(sc.defaultParallelism, 2)
        jSerialized = sc._jvm.PythonMLLibAPI().loadLabeledPoints(
            sc._jsc, path, minPartitions)
        serialized = RDD(jSerialized, sc, NoOpSerializer())
        return serialized.map(
            lambda bytes: _deserialize_labeled_point(bytearray(bytes)))
Example #14
    def jvertex_rdd(self):
        if self.jvrdd_val:
            return self.jvrdd_val
        if self.bypass_serializer:
            self.jvertex_rdd_deserializer = NoOpSerializer()
        enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true"
        profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None
        command = (self.func, profileStats, self.prev_jvertex_rdd_deserializer,
                   self.jvertex_rdd_deserializer)
        # the serialized command will be compressed by broadcast
        ser = CloudPickleSerializer()
        pickled_command = ser.dumps(command)
        if len(pickled_command) > (1 << 20):  # 1M
            self.broadcast = self.ctx.broadcast(pickled_command)
            pickled_command = ser.dumps(self.broadcast)
        broadcast_vars = ListConverter().convert(
            [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
            self.ctx._gateway._gateway_client)
        self.ctx._pickled_broadcast_vars.clear()
        env = MapConverter().convert(self.ctx.environment,
                                     self.ctx._gateway._gateway_client)
        includes = ListConverter().convert(self.ctx._python_includes,
                                           self.ctx._gateway._gateway_client)
        java_storage_level = self.ctx._getJavaStorageLevel(
            StorageLevel.MEMORY_ONLY)
        python_rdd = self.ctx._jvm.PythonVertexRDD(
            self.prev_jvertex_rdd, bytearray(pickled_command), env, includes,
            self.preservesPartitioning, self.ctx.pythonExec, broadcast_vars,
            self.ctx._javaAccumulator, java_storage_level)
        self.jvrdd_val = python_rdd.asJavaVertexRDD()

        if enable_profile:
            self.id = self.jvrdd_val.id()
            self.ctx._add_profile(self.id, profileStats)
        return self.jvrdd_val
Example #15
    def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName,
                     initialPositionInStream, checkpointInterval,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder,
                     stsAssumeRoleArn=None, stsSessionName=None, stsExternalId=None):
        """
        Create an input stream that pulls messages from a Kinesis stream. This uses the
        Kinesis Client Library (KCL) to pull messages from Kinesis.

        .. note:: The given AWS credentials will get saved in DStream checkpoints if checkpointing
            is enabled. Make sure that your checkpoint directory is secure.

        :param ssc:  StreamingContext object
        :param kinesisAppName:  Kinesis application name used by the Kinesis Client Library (KCL) to
                                update DynamoDB
        :param streamName:  Kinesis stream name
        :param endpointUrl:  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
        :param regionName:  Name of region used by the Kinesis Client Library (KCL) to update
                            DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
        :param initialPositionInStream:  In the absence of Kinesis checkpoint info, this is the
                                         worker's initial starting position in the stream. The
                                         values are either the beginning of the stream per Kinesis'
                                         limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) or
                                         the tip of the stream (InitialPositionInStream.LATEST).
        :param checkpointInterval:  Checkpoint interval for Kinesis checkpointing. See the Kinesis
                                    Spark Streaming documentation for more details on the different
                                    types of checkpoints.
        :param storageLevel:  Storage level to use for storing the received objects (default is
                              StorageLevel.MEMORY_AND_DISK_2)
        :param awsAccessKeyId:  AWS AccessKeyId (default is None. If None, will use
                                DefaultAWSCredentialsProviderChain)
        :param awsSecretKey:  AWS SecretKey (default is None. If None, will use
                              DefaultAWSCredentialsProviderChain)
        :param decoder:  A function used to decode value (default is utf8_decoder)
        :param stsAssumeRoleArn: ARN of IAM role to assume when using STS sessions to read from
                                 the Kinesis stream (default is None).
        :param stsSessionName: Name to uniquely identify STS sessions used to read from Kinesis
                               stream, if STS is being used (default is None).
        :param stsExternalId: External ID that can be used to validate against the assumed IAM
                              role's trust policy, if STS is being used (default is None).
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jduration = ssc._jduration(checkpointInterval)

        try:
            # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils
            helper = ssc._jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper()
        except TypeError as e:
            if str(e) == "'JavaPackage' object is not callable":
                KinesisUtils._printErrorMsg(ssc.sparkContext)
            raise
        jstream = helper.createStream(ssc._jssc, kinesisAppName, streamName, endpointUrl,
                                      regionName, initialPositionInStream, jduration, jlevel,
                                      awsAccessKeyId, awsSecretKey, stsAssumeRoleArn,
                                      stsSessionName, stsExternalId)
        stream = DStream(jstream, ssc, NoOpSerializer())
        return stream.map(lambda v: decoder(v))
Example #16
    def binaryRecords(self, path, recordLength):
        """
        Load data from a flat binary file, assuming each record is a set of numbers
        with the specified numerical format (see ByteBuffer), and the number of
        bytes per record is constant.

        :param path: Directory to the input data files
        :param recordLength: The length at which to split the records
        """
        return RDD(self._jsc.binaryRecords(path, recordLength), self, NoOpSerializer())
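For instance, records written as pairs of little-endian doubles (16 bytes each) could be decoded with struct; a minimal sketch with a hypothetical input path and an existing SparkContext sc:

import struct

records = sc.binaryRecords("/tmp/doubles.bin", recordLength=16)
pairs = records.map(lambda rec: struct.unpack("<dd", rec))  # -> RDD of (float, float)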
Example #17
def wrap_function(func, profiler=None):
    def pickle_command(command):
        # the serialized command will be compressed by broadcast
        ser = CloudPickleSerializer()
        pickled_command = ser.dumps(command)
        return pickled_command

    ser = AutoBatchedSerializer(PickleSerializer())
    command = (func, profiler, NoOpSerializer(), ser)
    pickled_command = pickle_command(command)
    return bytearray(pickled_command)
Example #18
    def binaryRecords(self, path, recordLength):
        """
        .. note:: Experimental

        Load data from a flat binary file, assuming each record is a set of numbers
        with the specified numerical format (see ByteBuffer), and the number of
        bytes per record is constant.

        :param path: Directory to the input data files
        :param recordLength: The length at which to split the records
        """
        return RDD(self._jsc.binaryRecords(path, recordLength), self, NoOpSerializer())
Example #19
    def binaryRecordsStream(self, directory, recordLength):
        """
        Create an input stream that monitors a Hadoop-compatible file system
        for new files and reads them as flat binary files with records of
        fixed length. Files must be written to the monitored directory by "moving"
        them from another location within the same file system.
        File names starting with . are ignored.

        @param directory:       Directory to load data from
        @param recordLength:    Length of each record in bytes
        """
        return DStream(self._jssc.binaryRecordsStream(directory, recordLength), self,
                       NoOpSerializer())
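A brief sketch of the streaming counterpart in use, watching a directory for new files of fixed 16-byte records; the path and record length are assumptions, and ssc is an existing StreamingContext:

recordStream = ssc.binaryRecordsStream("/tmp/incoming", recordLength=16)
recordStream.count().pprint()  # print the number of records in each batch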
Example #20
    def binaryFiles(self, path, minPartitions=None):
        """
        Read a directory of binary files from HDFS, a local file system
        (available on all nodes), or any Hadoop-supported file system URI
        as a byte array. Each file is read as a single record and returned
        in a key-value pair, where the key is the path of each file, the
        value is the content of each file.

        .. note:: Small files are preferred; large files are also allowed but
            may cause poor performance.
        """
        minPartitions = minPartitions or self.defaultMinPartitions
        return RDD(self._jsc.binaryFiles(path, minPartitions), self,
                   PairDeserializer(UTF8Deserializer(), NoOpSerializer()))
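A short sketch of binaryFiles in use, reporting the size of each file under a hypothetical directory (assumes an existing SparkContext sc):

files = sc.binaryFiles("/tmp/blobs")
for path, size in files.mapValues(len).collect():
    print(path, size)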
Example #21
    def __init__(self, ctx, resource_read=None, query=None, **kwargs):
        kwargs = make_es_config(kwargs,
                                resource_read=resource_read,
                                query=query)
        kwargs = as_java_object(ctx._gateway, kwargs)
        jrdd = helper(ctx).esJsonRDD(ctx._jsc, kwargs)
        rdd = RDD(jrdd, ctx, NoOpSerializer())

        # read the rdd in batches of two (first key then value / doc)
        def pairwise(iterable):
            iterator = iter(iterable)
            return zip(iterator, iterator)

        kvRdd = rdd.mapPartitions(pairwise, True)

        super(EsRDD, self).__init__(kvRdd._jrdd, ctx)
Example #22
    def createStream(ssc, hostname, port, queueName, threadNum,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2,
                     decoder=utf8_decoder):
        """
        Create an input stream that pulls events from Memcache.

        :param ssc:  StreamingContext object
        :param hostname:  Hostname of the machine to which the data will be sent
        :param port:  Port of the machine to which the data will be sent
        :param storageLevel:  Storage level to use for storing the received objects
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        helper = MQUtils._get_helper(ssc._sc)
        jstream = helper.createStream(ssc._jssc, hostname, port, queueName, threadNum, jlevel)
        stream = DStream(jstream, ssc, NoOpSerializer())
        return stream.map(lambda v: decoder(v))
Example #23
    def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the uniform distribution U(0.0, 1.0).

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        jrdd = sc._jvm.PythonMLLibAPI() \
            .uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        uniform = RDD(jrdd, sc, NoOpSerializer())
        return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #24
    def binaryFiles(self, path, minPartitions=None):
        """
        .. note:: Experimental

        Read a directory of binary files from HDFS, a local file system
        (available on all nodes), or any Hadoop-supported file system URI
        as a byte array. Each file is read as a single record and returned
        in a key-value pair, where the key is the path of each file, the
        value is the content of each file.

        .. note:: Small files are preferred; large files are also allowed but
            may cause poor performance.
        """
        minPartitions = minPartitions or self.defaultMinPartitions  # get the minimum number of partitions
        return RDD(self._jsc.binaryFiles(path, minPartitions), self,
                   PairDeserializer(UTF8Deserializer(), NoOpSerializer()))
Example #25
    def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of vectors containing i.i.d. samples drawn
        from the standard normal distribution.

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI() \
            .normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed)
        normal = RDD(jrdd, sc, NoOpSerializer())
        return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
Example #26
    def createStream(ssc, arn, destination, awsRegion, awsKey, awsSecret,
                     initialPosition, storageLevel, maxRecord, interval):
        sc = ssc.sparkContext
        java_import(
            sc._gateway.jvm,
            "org.apache.spark.streaming.kinesis.DynamoDBStream.DynamoDBPythonUtils"
        )
        DynamoDBPythonUtils = sc._gateway.jvm.DynamoDBPythonUtils

        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jstream = DynamoDBPythonUtils.DynamoDBStream(ssc._jssc, arn,
                                                     destination, awsRegion,
                                                     awsKey, awsSecret,
                                                     initialPosition, jlevel,
                                                     maxRecord, interval)

        stream = DStream(jstream, ssc, NoOpSerializer())
        return stream.map(lambda record: record.decode("utf-8")
                          if record is not None else None)
Example #27
    def poissonRDD(sc, mean, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the Poisson
        distribution with the input mean.

        >>> mean = 100.0
        >>> x = RandomRDDs.poissonRDD(sc, mean, 1000, seed=1)
        >>> stats = x.stats()
        >>> stats.count()
        1000
        >>> abs(stats.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(stats.stdev() - sqrt(mean)) < 0.5
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().poissonRDD(sc._jsc, mean, size, numPartitions, seed)
        poisson = RDD(jrdd, sc, NoOpSerializer())
        return poisson.map(lambda bytes: _deserialize_double(bytearray(bytes)))
Example #28
 def predict(self, x):
     """
     Predict the label of one or more examples.
     :param x:  Data point (feature vector),
                or an RDD of data points (feature vectors).
     """
     pythonAPI = self._sc._jvm.PythonMLLibAPI()
     if isinstance(x, RDD):
         # Bulk prediction
         if x.count() == 0:
             return self._sc.parallelize([])
         dataBytes = _get_unmangled_double_vector_rdd(x, cache=False)
         jSerializedPreds = \
             pythonAPI.predictDecisionTreeModel(self._java_model,
                                                dataBytes._jrdd)
         serializedPreds = RDD(jSerializedPreds, self._sc, NoOpSerializer())
         return serializedPreds.map(lambda bytes: _deserialize_double(bytearray(bytes)))
     else:
         # Assume x is a single data point.
         x_ = _serialize_double_vector(x)
         return pythonAPI.predictDecisionTreeModel(self._java_model, x_)
Example #29
    def normalRDD(sc, size, numPartitions=None, seed=None):
        """
        Generates an RDD comprised of i.i.d. samples from the standard normal
        distribution.

        To transform the distribution in the generated RDD from standard normal
        to some other normal N(mean, sigma^2), use
        C{RandomRDDs.normal(sc, n, p, seed)\
          .map(lambda v: mean + sigma * v)}

        >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1)
        >>> stats = x.stats()
        >>> stats.count()
        1000
        >>> abs(stats.mean() - 0.0) < 0.1
        True
        >>> abs(stats.stdev() - 1.0) < 0.1
        True
        """
        jrdd = sc._jvm.PythonMLLibAPI().normalRDD(sc._jsc, size, numPartitions, seed)
        normal = RDD(jrdd, sc, NoOpSerializer())
        return normal.map(lambda bytes: _deserialize_double(bytearray(bytes)))
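Analogously to the uniform case, the standard-normal samples can be rescaled to N(mean, sigma^2) with a map; a small sketch with arbitrary parameters, assuming an existing SparkContext sc:

from pyspark.mllib.random import RandomRDDs

mean, sigma = 10.0, 2.0
shifted = RandomRDDs.normalRDD(sc, 1000, seed=42).map(lambda v: mean + sigma * v)
stats = shifted.stats()
print(stats.mean(), stats.stdev())  # roughly 10.0 and 2.0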
Example #30
 def _jrdd(self):
     if self._jrdd_val:
         return self._jrdd_val
     if self._bypass_serializer:
         serializer = NoOpSerializer()
     else:
         serializer = self.ctx.serializer
     command = (self.func, self._prev_jrdd_deserializer, serializer)
     pickled_command = CloudPickleSerializer().dumps(command)
     broadcast_vars = ListConverter().convert(
         [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
         self.ctx._gateway._gateway_client)
     self.ctx._pickled_broadcast_vars.clear()
     class_tag = self._prev_jrdd.classTag()
     env = MapConverter().convert(self.ctx.environment,
                                  self.ctx._gateway._gateway_client)
     includes = ListConverter().convert(self.ctx._python_includes,
                                  self.ctx._gateway._gateway_client)
     python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
         bytearray(pickled_command), env, includes, self.preservesPartitioning,
         self.ctx.pythonExec, broadcast_vars, self.ctx._javaAccumulator,
         class_tag)
     self._jrdd_val = python_rdd.asJavaRDD()
     return self._jrdd_val