Example #1
 def create_table_using_sql(self, ddl, provider):
     # Build a DataFrame from the shared test data, recreate the target
     # table with the supplied DDL, and load the rows into it.
     sparkSession = SnappySession(self.sc)
     schema = StructType().add("col1", IntegerType()).add("col2", IntegerType()).add("col3", IntegerType())
     data = SnappyContextTests.testdata  # avoid shadowing the built-in input()
     dataDF = sparkSession.createDataFrame(data, schema)
     sparkSession.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
     sparkSession.sql(ddl)
     dataDF.write.insertInto(SnappyContextTests.tablename)
Example #2
 def create_table_using_sql(self, ddl, provider):
     sparkSession = SnappySession(self.sc)
     dataDF = sparkSession._sc.parallelize(SnappyContextTests.testdata,
                                           5).toDF()
     sparkSession.sql("DROP TABLE IF EXISTS " +
                      SnappyContextTests.tablename)
     sparkSession.sql(ddl)
     dataDF.write.insertInto(SnappyContextTests.tablename)
Example #3
 def create_table_using_sql(self, ddl, provider):
     sparkSession = SnappySession(self.sc)
     dataDF = sparkSession._sc.parallelize(SnappyContextTests.testdata, 5).toDF()
     sparkSession.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
     sparkSession.sql(ddl)
     dataDF.write.insertInto(SnappyContextTests.tablename)
Example #4
 def verify_table_rows(self, rowcount):
     sparkSession = SnappySession(self.sc)
     result = sparkSession.sql("SELECT COUNT(*) FROM " + SnappyContextTests.tablename).collect()
     self.assertEqual(result[0][0], rowcount)
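The two helpers above come from the same test class; the sketch below shows one way they might be wired together in a single test method. The DDL string and the "row" provider are illustrative assumptions, not part of the original suite.

 def test_create_and_verify_row_table(self):
     # Hypothetical test tying the two helpers together; the DDL and
     # provider string are illustrative assumptions.
     ddl = ("CREATE TABLE " + SnappyContextTests.tablename +
            " (col1 INT, col2 INT, col3 INT) USING row")
     self.create_table_using_sql(ddl, "row")
     self.verify_table_rows(len(SnappyContextTests.testdata))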
Example #5
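The class below omits its imports; a plausible set is sketched here. The py4j and pyspark locations are standard, while the module paths for SnappySession and SchemaDStream are SnappyData-specific and assumed.

# Imports the snippet below relies on; the last two paths are assumptions,
# since SnappySession and SchemaDStream live in SnappyData's pyspark fork.
from py4j.java_gateway import java_import, JavaObject

from pyspark import SparkConf, SparkContext
from pyspark.serializers import CloudPickleSerializer
from pyspark.sql.types import StructType
from pyspark.streaming import DStream, StreamingContext
from pyspark.streaming.util import TransformFunctionSerializer

from pyspark.sql.snappy import SnappySession          # assumed SnappyData path
from pyspark.streaming.snappy import SchemaDStream    # assumed SnappyData path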
class SnappyStreamingContext(StreamingContext):
    """
    Main entry point for Snappy Spark Streaming functionality. A SnappyStreamingContext
    represents the connection to a Snappy cluster and can be used to create
    L{DStream}s from various input sources. It is created from an existing L{SparkContext}.
    After creating and transforming DStreams, the streaming computation can
    be started and stopped using `context.start()` and `context.stop()`,
    respectively. `context.awaitTermination()` allows the current thread
    to wait for the termination of the context by `stop()` or by an exception.
    """
    _transformerSerializer = None

    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new SnappyStreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        @param jssc: an existing JavaSnappyStreamingContext to wrap; used
                     internally, e.g. when recreating the context from a checkpoint
        """

        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappySession = SnappySession(sparkContext)
        # self._snappycontext = SnappyContext(sparkContext, snappySession)

    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

        # register serializer for TransformFunction
        # this happens before the SparkContext is created when loading from a checkpoint
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)


    def _initialize_context(self, sc, duration):
        self._ensure_initialized()
        return self._jvm.JavaSnappyStreamingContext(sc._jsc, self._jduration(duration))

    @classmethod
    def getOrCreate(cls, checkpointPath, setupFunc):
        """
        Either recreate a SnappyStreamingContext from checkpoint data or create a new SnappyStreamingContext.
        If checkpoint data exists in the provided `checkpointPath`, then SnappyStreamingContext will be
        recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
        will be used to create a new context.

        @param checkpointPath: Checkpoint directory used in an earlier streaming program
        @param setupFunc:      Function to create a new context and set up DStreams
        """
        cls._ensure_initialized()
        gw = SparkContext._gateway

        # Check whether valid checkpoint information exists in the given path
        ssc_option = gw.jvm.SnappyStreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)
        if ssc_option.isEmpty():
            ssc = setupFunc()
            ssc.checkpoint(checkpointPath)
            return ssc

        jssc = gw.jvm.JavaSnappyStreamingContext(ssc_option.get())

        # If there is already an active Python SparkContext, use it; otherwise create a new one
        if not SparkContext._active_spark_context:
            jsc = jssc.sparkContext()
            conf = SparkConf(_jconf=jsc.getConf())
            SparkContext(conf=conf, gateway=gw, jsc=jsc)

        sc = SparkContext._active_spark_context

        # update ctx in serializer
        cls._transformerSerializer.ctx = sc
        return SnappyStreamingContext(sc, None, jssc)

    @classmethod
    def getActive(cls):
        """
        Return either the currently active SnappyStreamingContext (i.e., if there is a context started
        but not stopped) or None.
        """
        activePythonContext = cls._activeContext
        if activePythonContext is not None:
            # Verify that the current running Java StreamingContext is active and is the same one
            # backing the supposedly active Python context
            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
            activeJvmContextOption = activePythonContext._jvm.SnappyStreamingContext.getActive()

            if activeJvmContextOption.isEmpty():
                cls._activeContext = None
            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
                cls._activeContext = None
                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
                                "backing the action Python StreamingContext. This is unexpected.")
        return cls._activeContext

    def start(self):
        """
        Start the execution of the streams.
        """
        self._jssc.start()
        SnappyStreamingContext._activeContext = self


    def sql(self, sqlText):
        """Returns a :class:`DataFrame` representing the result of the given query.
        :return: :class:`DataFrame`
        """
        return self._snappySession.sql(sqlText)

    def union(self, *dstreams):
        """
        Create a unified DStream from multiple DStreams of the same
        type and same slide duration.
        """
        if not dstreams:
            raise ValueError("should have at least one DStream to union")
        if len(dstreams) == 1:
            return dstreams[0]
        if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same serializer")
        if len(set(s._slideDuration for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same slide duration")
        first = dstreams[0]
        jrest = [d._jdstream for d in dstreams[1:]]
        return DStream(self._jssc.union(first._jdstream, jrest), self, first._jrdd_deserializer)

    def createSchemaDStream(self, dstream, schema):
        """
        Creates a L{SchemaDStream} from a L{DStream} of rows by applying the given schema.
        """
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType, but got %s" % type(schema))
        if not isinstance(dstream, DStream):
            raise TypeError("dstream should be DStream, but got %s" % type(dstream))
        return SchemaDStream(dstream._jdstream, self, dstream._jrdd_deserializer, schema)
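For completeness, a minimal usage sketch of the class above, assuming a reachable SnappyData cluster. The socket source, schema, query, and checkpoint path are illustrative assumptions rather than part of the original module.

# Hypothetical driver exercising the SnappyStreamingContext surface shown above.
# Host, port, schema, and checkpoint path are made-up values for illustration.
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, StructType

sc = SparkContext(appName="snappy-streaming-sketch")
snsc = SnappyStreamingContext(sc, batchDuration=1)

# Parse lines such as "1,2" arriving on a socket into Rows matching the schema.
schema = StructType().add("col1", IntegerType()).add("col2", IntegerType())
lines = snsc.socketTextStream("localhost", 9999)
rows = lines.map(lambda line: Row(*[int(v) for v in line.split(",")]))

# Attach the schema to the row stream and run an ad-hoc query through the
# embedded SnappySession exposed by SnappyStreamingContext.sql().
stream = snsc.createSchemaDStream(rows, schema)
print(snsc.sql("SHOW TABLES").collect())

# Recovery from a checkpoint would instead go through getOrCreate, e.g.:
#   snsc = SnappyStreamingContext.getOrCreate("/tmp/checkpoints", make_context)
# where make_context is a user-supplied function returning a fresh context.

snsc.start()
snsc.awaitTermination()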