Example 1
    def union(self, *dstreams: "DStream[T]") -> "DStream[T]":
        """
        Create a unified DStream from multiple DStreams of the same
        type and same slide duration.
        """
        if not dstreams:
            raise ValueError("should have at least one DStream to union")
        if len(dstreams) == 1:
            return dstreams[0]
        if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same serializer")
        if len(set(s._slideDuration for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same slide duration")

        assert SparkContext._jvm is not None
        jdstream_cls = SparkContext._jvm.org.apache.spark.streaming.api.java.JavaDStream
        jpair_dstream_cls = SparkContext._jvm.org.apache.spark.streaming.api.java.JavaPairDStream
        gw = SparkContext._gateway
        if is_instance_of(gw, dstreams[0]._jdstream, jdstream_cls):
            cls = jdstream_cls
        elif is_instance_of(gw, dstreams[0]._jdstream, jpair_dstream_cls):
            cls = jpair_dstream_cls
        else:
            cls_name = dstreams[0]._jdstream.getClass().getCanonicalName()
            raise TypeError("Unsupported Java DStream class %s" % cls_name)

        assert gw is not None
        jdstreams = gw.new_array(cls, len(dstreams))
        for i in range(0, len(dstreams)):
            jdstreams[i] = dstreams[i]._jdstream
        return DStream(
            self._jssc.union(jdstreams),
            self,
            dstreams[0]._jrdd_deserializer,
        )
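A minimal usage sketch of the union helper above, assuming a local Spark installation; the application name and sample values are illustrative:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "union-example")
ssc = StreamingContext(sc, 1)

# Two queue-backed DStreams with the same serializer and slide duration.
stream_a = ssc.queueStream([sc.parallelize([1, 2, 3])])
stream_b = ssc.queueStream([sc.parallelize([4, 5, 6])])

# Merge them into a single DStream using the method shown above.
merged = ssc.union(stream_a, stream_b)
merged.pprint()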
Example 2
def joinWithCassandraTable(dstream, keyspace, table, selected_columns=None,
                           join_columns=None):
    """Joins a DStream (a stream of RDDs) with a Cassandra table

    Arguments:
        @param dstream(DStream):
            The DStream to join. Equal to self when invoking
            joinWithCassandraTable on a monkey-patched RDD.
        @param keyspace(string):
            The keyspace to join on.
        @param table(string):
            The CQL table to join on.
        @param selected_columns(string):
            The columns to select from the Cassandra table.
        @param join_columns(string):
            The columns used to join on from the Cassandra table.
    """

    ssc = dstream._ssc
    ctx = ssc._sc
    gw = ctx._gateway

    selected_columns = as_java_array(
        gw, "String", selected_columns) if selected_columns else None
    join_columns = as_java_array(gw, "String",
                                 join_columns) if join_columns else None

    h = helper(ctx)
    dstream = h.joinWithCassandraTable(dstream._jdstream, keyspace, table,
                                       selected_columns,
                                       join_columns)
    dstream = h.pickleRows(dstream)
    dstream = h.javaDStream(dstream)

    return DStream(dstream, ssc, AutoBatchedSerializer(PickleSerializer()))
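A hedged usage sketch for the pyspark-cassandra helper above; the import path, keyspace, table, and column names are all assumptions made for illustration, and `events` stands for a DStream defined elsewhere:

# Import path is an assumption; check the pyspark_cassandra package layout you use.
import pyspark_cassandra.streaming as pcs

joined = pcs.joinWithCassandraTable(
    events,                            # hypothetical DStream keyed by user_id
    keyspace="analytics",              # illustrative keyspace
    table="page_views",                # illustrative table
    selected_columns=["url", "visits"],
    join_columns=["user_id"],
)
joined.pprint()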
Example 3
    def queueStream(self, rdds, oneAtATime=True, default=None):
        """
        Create an input stream from a queue of RDDs or a list. In each batch,
        it will process either one or all of the RDDs returned by the queue.

        .. note:: Changes to the queue after the stream is created will not be recognized.

        @param rdds:       Queue of RDDs
        @param oneAtATime: pick one RDD each time, or pick all of them at once.
        @param default:    The default RDD if there are no more RDDs in the queue
        """
        if default and not isinstance(default, RDD):
            default = self._sc.parallelize(default)

        if not rdds and default:
            rdds = [rdds]

        if rdds and not isinstance(rdds[0], RDD):
            rdds = [self._sc.parallelize(input) for input in rdds]
        self._check_serializers(rdds)

        queue = self._jvm.PythonDStream.toRDDQueue([r._jrdd for r in rdds])
        if default:
            default = default._reserialize(rdds[0]._jrdd_deserializer)
            jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd)
        else:
            jdstream = self._jssc.queueStream(queue, oneAtATime)
        return DStream(jdstream, self, rdds[0]._jrdd_deserializer)
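A short end-to-end sketch of queueStream, reusing the `sc`/`ssc` from the first sketch; the data and the five-second timeout are illustrative:

# One RDD is consumed per batch because oneAtATime defaults to True.
rdd_queue = [sc.parallelize(range(i * 10, (i + 1) * 10)) for i in range(3)]
counts = ssc.queueStream(rdd_queue).count()
counts.pprint()

ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=True)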
Example 4
 def socketTextStream(self, hostname, port):
     """
     Create an input stream from a TCP source hostname:port. Data is received
     over a TCP socket and the received bytes are interpreted as UTF-8 encoded,
     '\n'-delimited lines.
     """
     return DStream(self._jssc.socketTextStream(hostname, port), self,
                    UTF8Deserializer())
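A sketch of consuming the socket stream, assuming an existing `ssc` and a local test source such as `nc -lk 9999`; host and port are placeholders:

lines = ssc.socketTextStream("localhost", 9999)
# Classic word count over each batch of received lines.
words = lines.flatMap(lambda line: line.split(" "))
counts = words.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
counts.pprint()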
Example 5
 def textFileStream(self, directory):
     """
     Create an input stream that monitors a Hadoop-compatible file system
     for new files and reads them as text files. Files must be written to the
     monitored directory by "moving" them from another location within the same
     file system. File names starting with . are ignored.
     """
     return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer())
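A minimal sketch, assuming `ssc` exists; the directory path is illustrative, and files must be moved into it atomically as the docstring notes:

# Only files moved into the directory after the stream starts are picked up.
logs = ssc.textFileStream("hdfs:///tmp/streaming-input")
errors = logs.filter(lambda line: "ERROR" in line)
errors.pprint()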
Example 6
    def binaryRecordsStream(self, directory, recordLength):
        """
        Create an input stream that monitors a Hadoop-compatible file system
        for new files and reads them as flat binary files with records of
        fixed length. Files must be written to the monitored directory by "moving"
        them from another location within the same file system.
        File names starting with . are ignored.

        @param directory:       Directory to load data from
        @param recordLength:    Length of each record in bytes
        """
        return DStream(self._jssc.binaryRecordsStream(directory, recordLength), self,
                       NoOpSerializer())
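A hedged sketch of decoding the fixed-length binary records; the 8-byte record size and the little-endian pair-of-int32 layout are assumptions made purely for illustration:

import struct

# Each element of the stream is exactly recordLength bytes.
records = ssc.binaryRecordsStream("hdfs:///tmp/binary-input", recordLength=8)
decoded = records.map(lambda rec: struct.unpack("<ii", rec))
decoded.pprint()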
Example 7
    def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Create an input stream from a TCP source hostname:port. Data is received
        over a TCP socket and the received bytes are interpreted as UTF-8 encoded,
        ``\\n``-delimited lines.

        @param hostname:      Hostname to connect to for receiving data
        @param port:          Port to connect to for receiving data
        @param storageLevel:  Storage level to use for storing the received objects
        """
        jlevel = self._sc._getJavaStorageLevel(storageLevel)
        return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,
                       UTF8Deserializer())
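The storage-level parameter can be exercised as below; MEMORY_ONLY is chosen only to show the argument, not as a recommendation:

from pyspark import StorageLevel

# Same socket source as before, but keep received blocks un-replicated in memory.
lines = ssc.socketTextStream("localhost", 9999,
                             storageLevel=StorageLevel.MEMORY_ONLY)
lines.pprint()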
Example 8
    def _loadBytesDStream(self, datasetId=DATA_KEY):
        """
        """
        jvm = self._sc._jvm
        java_import(jvm, "thunder_streaming.receivers.*")

        feeder_conf = self._feeder.conf
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())

        try:
            # TODO: are there closure problems with this approach? (why do Jascha/KafkaUtils do it differently?)
            dstream = DStream(
                self.ssc._jssc.receiverStream(jvm.HBaseReceiver(
                    ListConverter().convert(feeder_conf.get_sequence_names(), jvm._gateway_client),
                    settings.BASE_COL_FAM,
                    datasetId,
                    settings.MAX_KEY,
                    self.batch_time)),
                self.ssc, ser)
            return dstream.map(lambda kv: (kv[0], bytes(kv[1])))
        except Py4JJavaError as e:
            print "Could not create the synchronized DStream."
            raise e
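A hedged sketch of what a caller might do with the (key, raw bytes) pairs this helper returns; `loader` and the float64 record layout are hypothetical:

import numpy as np

# loader stands in for whatever object exposes _loadBytesDStream.
raw = loader._loadBytesDStream()
arrays = raw.mapValues(lambda buf: np.frombuffer(buf, dtype=np.float64))
arrays.pprint()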
Example 9
 def transform(self, dstreams, transformFunc):
     """
     Create a new DStream in which each RDD is generated by applying
     a function on RDDs of the DStreams. The order of the JavaRDDs in
     the transform function parameter will be the same as the order
     of corresponding DStreams in the list.
     """
     jdstreams = [d._jdstream for d in dstreams]
     # change the final serializer to sc.serializer
     func = TransformFunction(self._sc,
                              lambda t, *rdds: transformFunc(rdds),
                              *[d._jrdd_deserializer for d in dstreams])
     jfunc = self._jvm.TransformFunction(func)
     jdstream = self._jssc.transform(jdstreams, jfunc)
     return DStream(jdstream, self, self._sc.serializer)
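A small sketch of StreamingContext.transform, assuming `ssc` plus two hypothetical key-value DStreams `clicks` and `views` defined elsewhere:

def join_batches(rdds):
    # The RDDs arrive in the same order as the DStreams listed below.
    left, right = rdds
    return left.join(right)

joined = ssc.transform([clicks, views], join_batches)
joined.pprint()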
Example 10
 def union(self, *dstreams):
     """
     Create a unified DStream from multiple DStreams of the same
     type and same slide duration.
     """
     if not dstreams:
         raise ValueError("should have at least one DStream to union")
     if len(dstreams) == 1:
         return dstreams[0]
     if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
         raise ValueError("All DStreams should have same serializer")
     if len(set(s._slideDuration for s in dstreams)) > 1:
         raise ValueError("All DStreams should have same slide duration")
     first = dstreams[0]
     jrest = [d._jdstream for d in dstreams[1:]]
     return DStream(self._jssc.union(first._jdstream, jrest), self, first._jrdd_deserializer)
Example 11
 def union(self, *dstreams):
     """
     Create a unified DStream from multiple DStreams of the same
     type and same slide duration.
     """
     if not dstreams:
         raise ValueError("should have at least one DStream to union")
     if len(dstreams) == 1:
         return dstreams[0]
     if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
         raise ValueError("All DStreams should have same serializer")
     if len(set(s._slideDuration for s in dstreams)) > 1:
         raise ValueError("All DStreams should have same slide duration")
     cls = SparkContext._jvm.org.apache.spark.streaming.api.java.JavaDStream
     jdstreams = SparkContext._gateway.new_array(cls, len(dstreams))
     for i in range(0, len(dstreams)):
         jdstreams[i] = dstreams[i]._jdstream
     return DStream(self._jssc.union(jdstreams), self, dstreams[0]._jrdd_deserializer)
Example 12
    def queueStream(
        self,
        rdds: List[RDD[T]],
        oneAtATime: bool = True,
        default: Optional[RDD[T]] = None,
    ) -> "DStream[T]":
        """
        Create an input stream from a queue of RDDs or list. In each batch,
        it will process either one or all of the RDDs returned by the queue.

        Parameters
        ----------
        rdds : list
            Queue of RDDs
        oneAtATime : bool, optional
            pick one RDD each time, or pick all of them at once.
        default : :class:`pyspark.RDD`, optional
            The default RDD if there are no more RDDs in the queue

        Notes
        -----
        Changes to the queue after the stream is created will not be recognized.
        """
        if default and not isinstance(default, RDD):
            default = self._sc.parallelize(default)  # type: ignore[arg-type]

        if not rdds and default:
            rdds = [rdds]  # type: ignore[list-item]

        if rdds and not isinstance(rdds[0], RDD):
            rdds = [self._sc.parallelize(input)
                    for input in rdds]  # type: ignore[arg-type]
        self._check_serializers(rdds)

        assert self._jvm is not None
        queue = self._jvm.PythonDStream.toRDDQueue([r._jrdd for r in rdds])
        if default:
            default = default._reserialize(rdds[0]._jrdd_deserializer)
            assert default is not None
            jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd)
        else:
            jdstream = self._jssc.queueStream(queue, oneAtATime)
        return DStream(jdstream, self, rdds[0]._jrdd_deserializer)
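A short sketch focused on the default argument, reusing `sc` and `ssc` from earlier sketches; the values are illustrative:

# Once the queue is exhausted, the default RDD is emitted for each batch instead.
queue = [sc.parallelize([1, 2, 3])]
fallback = sc.parallelize([0])
stream = ssc.queueStream(queue, oneAtATime=True, default=fallback)
stream.pprint()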
Example 13
    def _testInputStream(self, test_inputs, numSlices=None):
        """
        This function is only for unittest.
        It requires a list as input and returns the i-th element of the list as
        the RDD for the i-th batch under the manual clock.
        """
        test_rdds = list()
        test_rdd_deserializers = list()
        for test_input in test_inputs:
            test_rdd = self._sc.parallelize(test_input, numSlices)
            test_rdds.append(test_rdd._jrdd)
            test_rdd_deserializers.append(test_rdd._jrdd_deserializer)
        # All deserializers have to be the same.
        # TODO: add deserializer validation
        jtest_rdds = ListConverter().convert(
            test_rdds, SparkContext._gateway._gateway_client)
        jinput_stream = self._jvm.PythonTestInputStream(
            self._jssc, jtest_rdds).asJavaDStream()

        return DStream(jinput_stream, self, test_rdd_deserializers[0])
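A hedged sketch of how this test-only helper is typically driven; it assumes an `ssc` backed by Spark's manual clock, which the real test harness configures separately:

# Each inner list becomes the RDD for one batch under the manual clock.
test_inputs = [[1, 2, 3], [4, 5], [6]]
stream = ssc._testInputStream(test_inputs, numSlices=2)

collected = []
stream.foreachRDD(lambda rdd: collected.append(rdd.collect()))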
Example 14
    def transform(self, dstreams: List["DStream[Any]"],
                  transformFunc: Callable[..., RDD[T]]) -> "DStream[T]":
        """
        Create a new DStream in which each RDD is generated by applying
        a function on RDDs of the DStreams. The order of the JavaRDDs in
        the transform function parameter will be the same as the order
        of corresponding DStreams in the list.
        """
        jdstreams = [d._jdstream
                     for d in dstreams]  # type: ignore[attr-defined]
        # change the final serializer to sc.serializer
        func = TransformFunction(
            self._sc,
            lambda t, *rdds: transformFunc(rdds),
            *[d._jrdd_deserializer
              for d in dstreams],  # type: ignore[attr-defined]
        )

        assert self._jvm is not None
        jfunc = self._jvm.TransformFunction(func)
        jdstream = self._jssc.transform(jdstreams, jfunc)
        return DStream(jdstream, self, self._sc.serializer)
Example 15
    def socketTextStream(
        self,
        hostname: str,
        port: int,
        storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2
    ) -> "DStream[str]":
        """
        Create an input stream from a TCP source hostname:port. Data is received
        over a TCP socket and the received bytes are interpreted as UTF-8 encoded,
        ``\\n``-delimited lines.

        Parameters
        ----------
        hostname : str
            Hostname to connect to for receiving data
        port : int
            Port to connect to for receiving data
        storageLevel : :class:`pyspark.StorageLevel`, optional
            Storage level to use for storing the received objects
        """
        jlevel = self._sc._getJavaStorageLevel(storageLevel)
        return DStream(self._jssc.socketTextStream(hostname, port, jlevel),
                       self, UTF8Deserializer())
Example 16
    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)

        self._schema = schema
        self._sqlcontext = SnappyContext(self._sc)
Example 17
    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)

        self._schema = schema
        self._snappySession = SnappySession(self._sc)