Python SnappySession Examples, pyspark.sql.snappy.SnappySession Python Examples

Example #1

0

Show file

 def test_delete(self):
     """Delete the rows matching ``col1=1`` from a fresh row table and check the result is 2."""
     self.drop_table(True)
     self.create_table_using_datasource("row")
     sparkSession = SnappySession(self.sc)
     # assertTrue(x, 2) used 2 as the failure message and only checked that
     # the result was truthy; assertEqual compares the value returned by delete.
     self.assertEqual(
         sparkSession.delete(SnappyContextTests.tablename, "col1=1"), 2)
     self.drop_table()

Example #2

0

Show file

File: tests.py Project: ziqian/snappydata

 def test_create_table_without_schema(self):
     """Create the test table from ``kv.parquet`` without a schema and expect 3 rows."""
     self.drop_table(True)
     # No provider or schema is passed: per the original note, the default
     # provider is parquet and the schema is picked up from the parquet file.
     here = os.path.dirname(os.path.abspath(__file__))
     parquetPath = os.path.join(here, "../../test_support/kv.parquet")
     session = SnappySession(self.sc)
     session.createTable(SnappyContextTests.tablename, path=parquetPath)
     self.verify_table_rows(3)
     self.drop_table()

Example #3

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def insert_table(self):
     """Insert two rows, then a single row, checking the table row count after each insert."""
     sparkSession = SnappySession(self.sc)
     # Plain int literals: the Python 2-only ``L`` suffix is a SyntaxError
     # on Python 3.
     newrow = [1, 2, 3], [2, 3, 4]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(7)
     newrow = [1, 2, 3]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(8)

Example #4

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def create_table_using_datasource(self, provider, schemaddl=False):
     """Create the test table with ``provider`` and append the shared test data.

     When ``schemaddl`` is exactly ``False`` the schema is taken from the
     DataFrame built from the test data; otherwise a column DDL string is used.
     """
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     rows = session._sc.parallelize(SnappyContextTests.testdata, 5)
     df = rows.toDF(["COL1", "COL2", "COL3"])
     if schemaddl is not False:
         session.createTable(table, provider, "(COL1 INT , COL2 INT , COL3 INT)")
     else:
         session.createTable(table, provider, df.schema)
     df.write.format("row").mode("append").saveAsTable(table)

Example #5

0

Show file

File: tests.py Project: ziqian/snappydata

    def test_new_session(self):
        """Check that a session and one derived via ``newSession()`` keep separate conf values."""
        parent = SnappySession(self.sc)
        parent.conf.set("test_key", "a")

        child = parent.newSession()
        child.conf.set("test_key", "b")

        # Each session must still report the value that was set on it.
        for session, expected in ((parent, "a"), (child, "b")):
            self.assertEqual(session.conf.get("test_key", ""), expected)

Example #6

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

    def test_new_session(self):
        """Set the same conf key on a session and on its ``newSession()`` and compare both."""
        first = SnappySession(self.sc)
        first.conf.set("test_key", "a")

        second = first.newSession()
        second.conf.set("test_key", "b")

        # The value written through one session must not replace the other's.
        value_in_first = first.conf.get("test_key", "")
        self.assertEqual(value_in_first, "a")
        value_in_second = second.conf.get("test_key", "")
        self.assertEqual(value_in_second, "b")

Example #7

0

Show file

    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Store the Spark, JVM and Java streaming handles and create a SnappySession.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        @param jssc: Java-side streaming context to reuse; when it is falsy a
                     new one is built through C{_initialize_context}
        """
        self._sc = sparkContext
        self._jvm = sparkContext._jvm
        if jssc:
            self._jssc = jssc
        else:
            self._jssc = self._initialize_context(sparkContext, batchDuration)
        self._snappySession = SnappySession(sparkContext)

Example #8

0

Show file

File: tests.py Project: ziqian/snappydata

 def test_csv(self):
     """Read ``kv.txt`` as CSV and insert it into a freshly created row table."""
     self.drop_table(True)
     self.create_table_using_datasource("row")
     # The fixture is located relative to this test file.
     testDir = os.path.dirname(os.path.abspath(__file__))
     csvPath = os.path.join(testDir, "../../test_support/kv.txt")
     csvFrame = SnappySession(self.sc).read.csv(csvPath)
     csvFrame.write.insertInto(tableName=SnappyContextTests.tablename)
     self.drop_table()

Example #9

0

Show file

 def test_csv(self):
     """Read ``kv.txt`` as CSV and insert it into a freshly created row table."""
     import os  # local import so this method does not rely on a file-level one

     self.drop_table(True)
     self.create_table_using_datasource("row")
     sparkSession = SnappySession(self.sc)
     # Resolve the fixture relative to this file instead of the current
     # working directory, so the test does not depend on where it is run from.
     csvPath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "../../test_support/kv.txt")
     sparkSession.read.csv(csvPath).write.insertInto(
         tableName=SnappyContextTests.tablename)
     self.drop_table()

Example #10

0

Show file

File: tests.py Project: ziqian/snappydata

 def create_table_using_sql(self, ddl, provider):
     """Recreate the test table by running ``ddl`` and load the shared test data into it.

     ``provider`` is accepted but not used by this method.
     """
     session = SnappySession(self.sc)
     # Three integer columns: col1, col2, col3.
     columns = StructType()
     for name in ("col1", "col2", "col3"):
         columns = columns.add(name, IntegerType())
     frame = session.createDataFrame(SnappyContextTests.testdata, columns)
     session.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
     session.sql(ddl)
     frame.write.insertInto(SnappyContextTests.tablename)

Example #11

0

Show file

File: tests.py Project: ziqian/snappydata

 def create_table_using_datasource(self, provider, schemaddl=False):
     """Create the test table with ``provider`` and append the shared test data.

     When ``schemaddl`` is exactly ``False`` an explicit three-integer-column
     schema is passed; otherwise a column DDL string is passed instead.
     """
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     columns = StructType()
     for name in ("col1", "col2", "col3"):
         columns = columns.add(name, IntegerType())
     df = session.createDataFrame(SnappyContextTests.testdata, columns)
     if schemaddl is not False:
         session.createTable(table, provider, "(COL1 INT , COL2 INT , COL3 INT)")
     else:
         session.createTable(table, provider, columns)
     df.write.format("row").mode("append").saveAsTable(table)

Example #12

0

Show file

 def create_table_using_sql(self, ddl, provider):
     """Drop the test table if present, run ``ddl``, then insert the shared test data.

     ``provider`` is accepted but not used by this method.
     """
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     testRows = session._sc.parallelize(SnappyContextTests.testdata, 5)
     dataDF = testRows.toDF()
     session.sql("DROP TABLE IF EXISTS " + table)
     session.sql(ddl)
     dataDF.write.insertInto(table)

Example #13

0

Show file

 def insert_table(self):
     """Insert two rows, then a single row, checking the table row count after each insert."""
     sparkSession = SnappySession(self.sc)
     # Plain int literals: the Python 2-only ``L`` suffix is a SyntaxError
     # on Python 3.
     newrow = [1, 2, 3], [2, 3, 4]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(7)
     newrow = [1, 2, 3]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(8)

Example #14

0

Show file

File: tests.py Project: ziqian/snappydata

 def insert_table(self):
     """Insert a pair of rows and then one row, verifying the row count each time."""
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     # First a tuple holding two rows, expected to bring the count to 7.
     session.insert(table, ((1, 2, 3), (2, 3, 4)))
     self.verify_table_rows(7)
     # Then a single row given as a flat list, expected to bring it to 8.
     session.insert(table, [1, 2, 3])
     self.verify_table_rows(8)

Example #15

0

Show file

 def create_table_using_datasource(self, provider, schemaddl=False):
     """Create the test table with ``provider`` and append the shared test data.

     When ``schemaddl`` is exactly ``False`` the schema is taken from the
     DataFrame built from the test data; otherwise a column DDL string is used.
     """
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     testRows = session._sc.parallelize(SnappyContextTests.testdata, 5)
     df = testRows.toDF(["COL1", "COL2", "COL3"])
     if schemaddl is not False:
         session.createTable(table, provider,
                             "(COL1 INT , COL2 INT , COL3 INT)")
     else:
         session.createTable(table, provider, df.schema)
     appender = df.write.format("row").mode("append")
     appender.saveAsTable(table)

Example #16

0

Show file

File: tests.py Project: zuoqin720/snappydata

    def test_schema_dstream(self):
        """Push one typed row through a schema DStream into ``testTable`` and expect a count of 1."""
        sample = (127, -128, -32768, 32767, 2147483647, 1.0,
                  date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                  {"a": 1}, (2, ), [1, 2, 3], None)
        batches = [self.sc.parallelize([sample])]

        # (name, type, nullable) for each column, in the order of ``sample``.
        columns = [
            ("byte1", ByteType(), False),
            ("byte2", ByteType(), False),
            ("short1", ShortType(), False),
            ("short2", ShortType(), False),
            ("int1", IntegerType(), False),
            ("float1", FloatType(), False),
            ("date1", DateType(), False),
            ("time1", TimestampType(), False),
            ("map1", MapType(StringType(), IntegerType(), False), False),
            ("struct1", StructType([StructField("b", ShortType(), False)]),
             False),
            ("list1", ArrayType(ByteType(), False), False),
            ("null1", DoubleType(), True),
        ]
        schema = StructType(
            [StructField(name, dtype, nullable)
             for name, dtype, nullable in columns])

        dstream = self.ssc.queueStream(batches)
        self.ssc.sql("drop  table if exists testTable")

        self.ssc._snappySession.createTable("testTable", "column", schema)

        schema_stream = self.ssc.createSchemaDStream(dstream, schema)

        def append_batch(df):
            # Append each batch's DataFrame to the column table.
            df.write.format("column").mode("append").saveAsTable("testTable")

        schema_stream.foreachDataFrame(append_batch)

        # This count is issued before the stream starts; its result is unused.
        self.ssc.sql("select count (*)  from testTable").collect()
        self.ssc.start()
        self.ssc.awaitTermination(2)
        session = SnappySession(self.sc)
        counted = session.sql("select count(*) from testTable").collect()
        self.assertEqual(counted[0][0], 1)

Example #17

0

Show file

File: snappydstream.py Project: ziqian/snappydata

class SchemaDStream(DStream):
    """
     A DStream that carries a schema, so that the RDD of every batch can be
     turned into a DataFrame through a SnappySession.
     It is similar to SchemaRDD, which offers the similar functions.
     Some of the abstraction and code is borrowed from the project:
     https://github.com/Intel-bigdata/spark-streamingsql
     @param jdstream: passed through to the DStream initialiser
     @param ssc: passed through to the DStream initialiser
     @param jrdd_deserializer: passed through to the DStream initialiser
     @param schema: schema used when building a DataFrame from each RDD
    """
    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)

        # self._sc is not assigned here; it is read once the DStream
        # initialiser above has run.
        self._snappySession = SnappySession(self._sc)
        self._schema = schema

    def foreachDataFrame(self, func):
        """Call ``func`` with a DataFrame built from each RDD of this stream."""
        def _rdd_to_frame(_time, rdd):
            # The first argument supplied by foreachRDD is ignored.
            func(self._snappySession.createDataFrame(rdd, self._schema))

        self.foreachRDD(_rdd_to_frame)

Example #18

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def verify_table_rows(self, rowcount):
     """Assert that ``SELECT COUNT(*)`` on the test table equals ``rowcount``."""
     sparkSession = SnappySession(self.sc)
     result = sparkSession.sql("SELECT COUNT(*) FROM " + SnappyContextTests.tablename).collect()
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(result[0][0], rowcount)

Example #19

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def truncate_table(self):
     """Truncate the shared test table through a new SnappySession."""
     SnappySession(self.sc).truncateTable(SnappyContextTests.tablename)

Example #20

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def update_table(self):
     """Set COL1 to 7 where ``COL2 =2`` and check that 3 rows are reported as modified."""
     sparkSession = SnappySession(self.sc)
     # Plain int literal: the Python 2-only ``7L`` is a SyntaxError on Python 3.
     modifiedrows = sparkSession.update(SnappyContextTests.tablename, "COL2 =2", [7], ["COL1"])
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(modifiedrows, 3)

Example #21

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def test_delete(self):
     """Delete the rows matching ``col1=1`` from a fresh row table and check the result is 2."""
     self.drop_table(True)
     self.create_table_using_datasource("row")
     sparkSession = SnappySession(self.sc)
     # assertTrue(x, 2) used 2 as the failure message and only checked that
     # the result was truthy; assertEqual compares the value returned by delete.
     self.assertEqual(sparkSession.delete(SnappyContextTests.tablename, "col1=1"), 2)
     self.drop_table()

Example #22

0

Show file

File: snappydstream.py Project: ziqian/snappydata

    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        """Initialise the underlying DStream, then keep a SnappySession and the schema."""
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)

        # self._sc is not assigned in this method; it is read once the
        # DStream initialiser above has run.
        self._snappySession = SnappySession(self._sc)
        self._schema = schema

Example #23

0

Show file

File: tests.py Project: ziqian/snappydata

 def drop_table(self, ifexists=False):
     """Drop the shared test table; ``ifexists`` is forwarded to ``dropTable``."""
     SnappySession(self.sc).dropTable(SnappyContextTests.tablename, ifexists)

Example #24

0

Show file

File: tests.py Project: ziqian/snappydata

 def verify_table_rows(self, rowcount):
     """Assert that ``SELECT COUNT(*)`` on the test table equals ``rowcount``."""
     sparkSession = SnappySession(self.sc)
     result = sparkSession.sql("SELECT COUNT(*) FROM " + SnappyContextTests.tablename).collect()
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(result[0][0], rowcount)

Example #25

0

Show file

File: CreateTable.py Project: zuoqin720/snappydata

                       False,
                       PARTITION_BY='PS_PARTKEY')

    print
    print("Inserting data in PARTSUPP table using dataframe")
    tuples = [(100, 1, 5000, Decimal(100)), (200, 2, 50, Decimal(10)),
              (300, 3, 1000, Decimal(20)), (400, 4, 200, Decimal(30))]
    rdd = sc.parallelize(tuples)
    tuplesDF = snappy.createDataFrame(rdd, schema)
    tuplesDF.write.insertInto("PARTSUPP")
    print("Printing the contents of the PARTSUPP table")
    snappy.sql("SELECT * FROM PARTSUPP").show()

    print("Update the available quantity for PARTKEY 100")
    snappy.update("PARTSUPP", "PS_PARTKEY =100", [50000], ["PS_AVAILQTY"])
    print("Printing the contents of the PARTSUPP table after update")
    snappy.sql("SELECT * FROM PARTSUPP").show()

    print("Delete the records for PARTKEY 400")
    snappy.delete("PARTSUPP", "PS_PARTKEY =400")
    print("Printing the contents of the PARTSUPP table after delete")
    snappy.sql("SELECT * FROM PARTSUPP").show()

    print("****Done****")


if __name__ == "__main__":
    # Script entry point: run the example against a local SparkContext.
    sc = SparkContext('local[*]', 'Python Example')
    try:
        snappy = SnappySession(sc)
        main(snappy)
    finally:
        # Always release the SparkContext, even if the example fails midway.
        sc.stop()

Example #26

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def drop_table(self, ifexists=False):
     """Drop the shared test table, forwarding ``ifexists`` to ``dropTable``."""
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     session.dropTable(table, ifexists)

Example #27

0

Show file

File: tests.py Project: SnappyDataInc/snappydata

 def create_table_using_sql(self, ddl, provider):
     """Drop the test table if present, run ``ddl``, then insert the shared test data.

     ``provider` ` is accepted but not used by this method.
     """
     session = SnappySession(self.sc)
     table = SnappyContextTests.tablename
     testRows = session._sc.parallelize(SnappyContextTests.testdata, 5)
     dataDF = testRows.toDF()
     session.sql("DROP TABLE IF EXISTS " + table)
     session.sql(ddl)
     dataDF.write.insertInto(table)

Example #28

0

Show file

File: tests.py Project: ziqian/snappydata

 def update_table(self):
     """Set COL1 to 7 where ``COL2 =2`` and check that 3 rows are reported as modified."""
     sparkSession = SnappySession(self.sc)
     modifiedrows = sparkSession.update(SnappyContextTests.tablename, "COL2 =2", [7], ["COL1"])
     # assertEqual reports both values on failure, unlike assertTrue(a == b).
     self.assertEqual(modifiedrows, 3)

Example #29

0

Show file

File: tests.py Project: ziqian/snappydata

 def truncate_table(self):
     """Truncate the shared test table, passing ``True`` as the second argument."""
     SnappySession(self.sc).truncateTable(SnappyContextTests.tablename, True)

Example #30

0

Show file

class SnappyStreamingContext(StreamingContext):
    """
    Main entry point for Snappy Spark Streaming functionality. A SnappyStreamingContext
    represents the connection to a Snappy cluster, and can be used to create
    L{DStream} from various input sources. It can be from an existing L{SparkContext}.
    After creating and transforming DStreams, the streaming computation can
    be started and stopped using `context.start()` and `context.stop()`,
    respectively. `context.awaitTermination()` allows the current thread
    to wait for the termination of the context by `stop()` or by an exception.
    """
    # Set by _ensure_initialized(); its ``ctx`` is updated in getOrCreate().
    _transformerSerializer = None

    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new SnappyStreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        @param jssc: Java-side streaming context to reuse; when it is falsy a
                     new one is built through C{_initialize_context}
        """

        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappySession = SnappySession(sparkContext)

    @classmethod
    def _ensure_initialized(cls):
        """
        Import the streaming Java packages into the gateway's JVM view, start
        the gateway callback server if it is not running yet, and register the
        serializer for TransformFunction.
        """
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            # port 0 is requested here; the real port is read back from the
            # server socket below
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)

    def _initialize_context(self, sc, duration):
        """
        Build the Java-side JavaSnappyStreamingContext for ``sc`` with the
        given batch ``duration``.
        """
        self._ensure_initialized()
        return self._jvm.JavaSnappyStreamingContext(sc._jsc, self._jduration(duration))

    @classmethod
    def getOrCreate(cls, checkpointPath, setupFunc):
        """
        Either recreate a SnappyStreamingContext from checkpoint data or create a new SnappyStreamingContext.
        If checkpoint data exists in the provided `checkpointPath`, then SnappyStreamingContext will be
        recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
        will be used to create a new context.

        @param checkpointPath: Checkpoint directory used in an earlier streaming program
        @param setupFunc:      Function to create a new context and setup DStreams
        """
        cls._ensure_initialized()
        gw = SparkContext._gateway

        # Check whether valid checkpoint information exists in the given path
        ssc_option = gw.jvm.SnappyStreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)
        if ssc_option.isEmpty():
            ssc = setupFunc()
            ssc.checkpoint(checkpointPath)
            return ssc

        jssc = gw.jvm.JavaSnappyStreamingContext(ssc_option.get())

        # If there is already an active instance of Python SparkContext use it, or create a new one
        if not SparkContext._active_spark_context:
            jsc = jssc.sparkContext()
            conf = SparkConf(_jconf=jsc.getConf())
            SparkContext(conf=conf, gateway=gw, jsc=jsc)

        sc = SparkContext._active_spark_context

        # update ctx in serializer
        cls._transformerSerializer.ctx = sc
        return SnappyStreamingContext(sc, None, jssc)

    @classmethod
    def getActive(cls):
        """
        Return either the currently active SnappyStreamingContext (i.e., if there is a context started
        but not stopped) or None.

        Raises an Exception when the JVM reports an active context that is not
        the one backing the recorded Python context.
        """
        activePythonContext = cls._activeContext
        if activePythonContext is not None:
            # Verify that the current running Java StreamingContext is active and is the same one
            # backing the supposedly active Python context
            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
            activeJvmContextOption = activePythonContext._jvm.SnappyStreamingContext.getActive()

            if activeJvmContextOption.isEmpty():
                cls._activeContext = None
            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
                cls._activeContext = None
                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
                                "backing the active Python StreamingContext. This is unexpected.")
        return cls._activeContext

    def start(self):
        """
        Start the execution of the streams and record this context as the
        active one.
        """
        self._jssc.start()
        SnappyStreamingContext._activeContext = self

    def sql(self, sqlText):
        """Returns a :class:`DataFrame` representing the result of the given query.
        :return: :class:`DataFrame`
        """
        return self._snappySession.sql(sqlText)

    def union(self, *dstreams):
        """
        Create a unified DStream from multiple DStreams of the same
        type and same slide duration.

        Raises ValueError when no DStream is given, or when the DStreams differ
        in serializer or slide duration. A single DStream is returned as is.
        """
        if not dstreams:
            raise ValueError("should have at least one DStream to union")
        if len(dstreams) == 1:
            return dstreams[0]
        if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same serializer")
        if len(set(s._slideDuration for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same slide duration")
        first = dstreams[0]
        jrest = [d._jdstream for d in dstreams[1:]]
        return DStream(self._jssc.union(first._jdstream, jrest), self, first._jrdd_deserializer)

    def createSchemaDStream(self, dstream, schema):
        """
        Creates a L{SchemaDStream} from a DStream and a schema.

        Raises TypeError when ``schema`` is not a StructType or ``dstream`` is
        not a DStream.
        """
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType, but got %s" % type(schema))
        if not isinstance(dstream, DStream):
            raise TypeError("dstream should be DStream, but got %s" % type(dstream))
        return SchemaDStream(dstream._jdstream, self, dstream._jrdd_deserializer, schema)