def test_delete(self):
    self.drop_table(True)
    self.create_table_using_datasource("row")
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    # delete() returns the number of rows deleted; assertTrue's second
    # argument is only a failure message, so assertEqual is the right check
    self.assertEqual(
        sqlcontext.delete(SnappyContextTests.tablename, "col1=1"), 2)
    self.drop_table()
class SchemaDStream(DStream):
    """
    A SQL-based DStream with support for schema/Product.

    This class offers the ability to run SQL queries on DStreams. It is
    similar to SchemaRDD and offers similar functionality. Internally,
    the RDD of each batch duration is treated as a small table, and
    continuous queries (CQs) are evaluated on those small tables.

    Some of the abstractions and code are borrowed from the project:
    https://github.com/Intel-bigdata/spark-streamingsql
    """

    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)
        self._schema = schema
        self._sqlcontext = SnappyContext(self._sc)

    def foreachDataFrame(self, func):
        def createDataFrame(_, rdd):
            df = self._sqlcontext.createDataFrame(rdd, self._schema)
            func(df)
        self.foreachRDD(createDataFrame)
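
# Usage sketch for SchemaDStream.foreachDataFrame (not part of the sources
# above). It assumes an already-constructed SnappyStreamingContext `snsc`
# and an input DStream `dstream` whose rows match `schema`; the table name
# "streaming_table" is illustrative.
from pyspark.sql.types import StructType, StructField, IntegerType

schema = StructType([StructField("COL1", IntegerType(), False)])
sds = snsc.createSchemaDStream(dstream, schema)

def save_batch(df):
    # Each micro-batch is handed to the callback as a DataFrame built
    # from the batch RDD and the schema above
    df.write.format("column").mode("append").saveAsTable("streaming_table")

sds.foreachDataFrame(save_batch)
snsc.start()
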
def insert_table(self):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    newrow = [1L, 2L, 3L], [2L, 3L, 4L]
    sqlcontext.insert(SnappyContextTests.tablename, newrow)
    self.verify_table_rows(7)
    newrow = [1L, 2L, 3L]
    sqlcontext.insert(SnappyContextTests.tablename, newrow)
    self.verify_table_rows(8)
def create_table_using_datasource(self, provider, schemaddl=False):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    df = sqlcontext._sc.parallelize(SnappyContextTests.testdata, 5).toDF(
        ["COL1", "COL2", "COL3"])
    if schemaddl is False:
        sqlcontext.createTable(SnappyContextTests.tablename, provider, df.schema)
    else:
        sqlcontext.createTable(SnappyContextTests.tablename, provider,
                               "(COL1 INT , COL2 INT , COL3 INT)")
    df.write.format("row").mode("append").saveAsTable(SnappyContextTests.tablename)
def create_table_using_sql(self, ddl, provider):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    dataDF = sqlcontext._sc.parallelize(SnappyContextTests.testdata, 5).toDF()
    sqlcontext.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
    sqlcontext.sql(ddl)
    dataDF.write.format(provider).mode("append").saveAsTable(
        SnappyContextTests.tablename)
def test_new_session(self):
    sqlcontext1 = SnappyContext.getOrCreate(self.sc)
    sqlcontext1.setConf("test_key", "a")
    sqlcontext2 = sqlcontext1.newSession()
    sqlcontext2.setConf("test_key", "b")
    self.assertEqual(sqlcontext1.getConf("test_key", ""), "a")
    self.assertEqual(sqlcontext2.getConf("test_key", ""), "b")
def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SnappySession."""
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    SparkSession.__init__(self, sparkContext)
    if jsparkSession is None:
        jsparkSession = self._jvm.SnappySession(self._jsc.sc())
    from pyspark.sql.snappy import SnappyContext
    self._wrapped = SnappyContext(self._sc, jsparkSession)
    self._jsparkSession = jsparkSession
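
# Construction sketch for SnappySession (an illustrative example, not from
# the sources above). It assumes a SnappyData-enabled PySpark install where
# SnappySession is importable from pyspark.sql.snappy.
from pyspark import SparkConf, SparkContext
from pyspark.sql.snappy import SnappySession

conf = SparkConf().setAppName("SnappySessionExample")
sc = SparkContext(conf=conf)

# The session wraps a JVM-side SnappySession; SQL calls go through the
# wrapped SnappyContext set up in __init__ above
snappy = SnappySession(sc)
snappy.sql("SELECT 1").show()
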
def put_table(self):
    sqlcontext = SnappyContext(self.sc)
    newrow = [1L, 2L, 3L], [2L, 3L, 4L]
    sqlcontext.put(SnappyContextTests.tablename, newrow)
    self.verify_table_rows(7)
    newrow = [1L, 2L, 3L]
    sqlcontext.put(SnappyContextTests.tablename, newrow)
    self.verify_table_rows(8)
def test_schema_dstream(self):
    rdd = [self.sc.parallelize([
        (127, -128, -32768, 32767, 2147483647, 1.0,
         date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
         {"a": 1}, (2,), [1, 2, 3], None)])]
    schema = StructType([
        StructField("byte1", ByteType(), False),
        StructField("byte2", ByteType(), False),
        StructField("short1", ShortType(), False),
        StructField("short2", ShortType(), False),
        StructField("int1", IntegerType(), False),
        StructField("float1", FloatType(), False),
        StructField("date1", DateType(), False),
        StructField("time1", TimestampType(), False),
        StructField("map1", MapType(StringType(), IntegerType(), False), False),
        StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
        StructField("list1", ArrayType(ByteType(), False), False),
        StructField("null1", DoubleType(), True)])
    dstream = self.ssc.queueStream(rdd)
    self.ssc.sql("drop table if exists testTable")
    self.ssc._snappycontext.createTable("testTable", "column", schema)
    schemadstream = self.ssc.createSchemaDStream(dstream, schema)

    def testFunction(df):
        df.write.format("column").mode("append").saveAsTable("testTable")

    schemadstream.foreachDataFrame(testFunction)
    self.ssc.sql("select count(*) from testTable").collect()
    self.ssc.start()
    self.ssc.awaitTermination(2)
    result = SnappyContext(self.sc).sql(
        "select count(*) from testTable").collect()
    self.assertEqual(result[0][0], 1)
def update_table(self):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    newColumnValues = Row(col1=7L)
    modifiedrows = sqlcontext.update(SnappyContextTests.tablename,
                                     "COL2 = 2", newColumnValues, "COL1")
    self.assertEqual(modifiedrows, 3)
class SnappyStreamingContext(StreamingContext):
    """
    Main entry point for Snappy Spark Streaming functionality. A
    SnappyStreamingContext represents the connection to a Snappy cluster,
    and can be used to create L{DStream}s from various input sources. It
    can be created from an existing L{SparkContext}. After creating and
    transforming DStreams, the streaming computation can be started and
    stopped using `context.start()` and `context.stop()`, respectively.
    `context.awaitTermination()` allows the current thread to wait for the
    termination of the context by `stop()` or by an exception.
    """

    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new StreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which
            streaming data will be divided into batches
        """
        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappycontext = SnappyContext(sparkContext)

    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            cls._transformerSerializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)

    def _initialize_context(self, sc, duration):
        self._ensure_initialized()
        return self._jvm.JavaSnappyStreamingContext(sc._jsc, self._jduration(duration))

    @classmethod
    def getOrCreate(cls, checkpointPath, setupFunc):
        """
        Either recreate a SnappyStreamingContext from checkpoint data or create a new
        SnappyStreamingContext. If checkpoint data exists in the provided `checkpointPath`,
        then SnappyStreamingContext will be recreated from the checkpoint data. If the data
        does not exist, then the provided setupFunc will be used to create a new context.

        @param checkpointPath: Checkpoint directory used in an earlier streaming program
        @param setupFunc: Function to create a new context and setup DStreams
        """
        cls._ensure_initialized()
        gw = SparkContext._gateway

        # Check whether valid checkpoint information exists in the given path
        ssc_option = gw.jvm.SnappyStreamingContextPythonHelper().tryRecoverFromCheckpoint(
            checkpointPath)
        if ssc_option.isEmpty():
            ssc = setupFunc()
            ssc.checkpoint(checkpointPath)
            return ssc

        jssc = gw.jvm.JavaSnappyStreamingContext(ssc_option.get())

        # If there is already an active instance of Python SparkContext use it, or create a new one
        if not SparkContext._active_spark_context:
            jsc = jssc.sparkContext()
            conf = SparkConf(_jconf=jsc.getConf())
            SparkContext(conf=conf, gateway=gw, jsc=jsc)

        sc = SparkContext._active_spark_context

        # update ctx in serializer
        cls._transformerSerializer.ctx = sc
        return SnappyStreamingContext(sc, None, jssc)

    @classmethod
    def getActive(cls):
        """
        Return either the currently active SnappyStreamingContext (i.e., if there is a
        context started but not stopped) or None.
        """
        activePythonContext = cls._activeContext
        if activePythonContext is not None:
            # Verify that the current running Java StreamingContext is active and is the same one
            # backing the supposedly active Python context
            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
            activeJvmContextOption = activePythonContext._jvm.SnappyStreamingContext.getActive()

            if activeJvmContextOption.isEmpty():
                cls._activeContext = None
            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
                cls._activeContext = None
                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
                                "backing the active Python StreamingContext. This is unexpected.")
        return cls._activeContext

    def start(self):
        """
        Start the execution of the streams.
        """
        self._jssc.start()
        SnappyStreamingContext._activeContext = self

    def sql(self, sqlText):
        """Returns a :class:`DataFrame` representing the result of the given query.

        :return: :class:`DataFrame`
        """
        return self._snappycontext.sql(sqlText)

    def union(self, *dstreams):
        """
        Create a unified DStream from multiple DStreams of the same
        type and same slide duration.
        """
        if not dstreams:
            raise ValueError("should have at least one DStream to union")
        if len(dstreams) == 1:
            return dstreams[0]
        if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same serializer")
        if len(set(s._slideDuration for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same slide duration")
        first = dstreams[0]
        jrest = [d._jdstream for d in dstreams[1:]]
        return DStream(self._jssc.union(first._jdstream, jrest), self,
                       first._jrdd_deserializer)

    def createSchemaDStream(self, dstream, schema):
        """
        Creates a SchemaDStream from a DStream of Product.
        """
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType, but got %s" % type(schema))
        if not isinstance(dstream, DStream):
            raise TypeError("dstream should be DStream, but got %s" % type(dstream))
        return SchemaDStream(dstream._jdstream, self, dstream._jrdd_deserializer, schema)
def test_get_or_create(self):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    self.assertTrue(SnappyContext.getOrCreate(self.sc) is sqlcontext)
    totalTimeCol = int(time.time() * 1000) - start
    print("Query time: %dms" % totalTimeCol)

    # Suppose a particular airline company, say 'Delta Air Lines Inc.',
    # re-brands itself as 'Delta America'. Update the row table.
    query = "CODE = 'DL'"
    newColumnValues = ["Delta America Renewed"]
    sqlContext.update(ROW_TABLE_NAME, query, newColumnValues, ["DESCRIPTION"])

    # DataFrame query: which airlines arrive on schedule? JOIN with the
    # reference table.
    colResultAftUpd = airlineDF.alias('airlineDF') \
        .join(airlineCodeDF.alias('airlineCodeDF'),
              col('airlineDF.UniqueCarrier') == col('airlineCodeDF.CODE')) \
        .groupBy(col('airlineDF.UniqueCarrier'), col('airlineCodeDF.Description')) \
        .agg({"ArrDelay": "avg"}) \
        .orderBy("avg(ArrDelay)")

    print("Airline arrival schedule after updated values:")
    startColUpd = int(time.time() * 1000)
    colResultAftUpd.show()
    totalTimeColUpd = int(time.time() * 1000) - startColUpd
    print("Query time: %dms" % totalTimeColUpd)


if __name__ == "__main__":
    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    sc = SparkContext(conf=conf)
    snc = SnappyContext(sc)
    main(snc)
def update_table(self):
    sqlcontext = SnappyContext(self.sc)
    modifiedrows = sqlcontext.update(SnappyContextTests.tablename,
                                     "COL2 = 2", [7L], ["COL1"])
    self.assertEqual(modifiedrows, 3)
def verify_table_rows(self, rowcount):
    sqlcontext = SnappyContext(self.sc)
    result = sqlcontext.sql("SELECT COUNT(*) FROM " +
                            SnappyContextTests.tablename).collect()
    self.assertEqual(result[0][0], rowcount)
def truncate_table(self):
    sqlcontext = SnappyContext(self.sc)
    sqlcontext.truncateTable(SnappyContextTests.tablename)
def truncate_table(self):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    sqlcontext.truncateTable(SnappyContextTests.tablename)
def drop_table(self, ifexists=False):
    sqlcontext = SnappyContext(self.sc)
    sqlcontext.dropTable(SnappyContextTests.tablename, ifexists)
def verify_table_rows(self, rowcount):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    result = sqlcontext.sql("SELECT COUNT(*) FROM " +
                            SnappyContextTests.tablename).collect()
    self.assertTrue(result[0]._c0 == rowcount)
def drop_table(self, ifexists=False):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    sqlcontext.dropTable(SnappyContextTests.tablename, ifexists)
def update_table(self):
    sqlcontext = SnappyContext.getOrCreate(self.sc)
    modifiedrows = sqlcontext.update(SnappyContextTests.tablename,
                                     "COL2 = 2", [7L], ["COL1"])
    self.assertEqual(modifiedrows, 3)