Example #1
 def test_csv(self):
     self.drop_table(True)
     self.create_table_using_datasource("row")
     sparkSession = SnappySession(self.sc)
     sparkSession.read.csv("../../test_support/kv.txt").write.insertInto(
         tableName=SnappyContextTests.tablename)
     self.drop_table()
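
The relative path above is resolved against the current working directory, so this variant only works when the test is launched from the expected location (Example #3 builds the path from __file__ instead). As a hedged sketch continuing the same test, the CSV could also be read with an explicit schema so its columns line up with the target table by position; the column names and the layout of kv.txt are assumptions here, not taken from the test suite:

    # Hedged sketch: read the CSV with an explicit schema before insertInto().
    # Column names/types for kv.txt are assumed for illustration only.
    from pyspark.sql.types import StructType, IntegerType

    csv_schema = (StructType()
                  .add("col1", IntegerType())
                  .add("col2", IntegerType())
                  .add("col3", IntegerType()))
    df = sparkSession.read.schema(csv_schema).csv("../../test_support/kv.txt")
    df.write.insertInto(tableName=SnappyContextTests.tablename)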
Example #2
 def test_delete(self):
     self.drop_table(True)
     self.create_table_using_datasource("row")
     sparkSession = SnappySession(self.sc)
     # delete() returns the number of rows removed for the given filter
     self.assertEqual(
         sparkSession.delete(SnappyContextTests.tablename, "col1=1"), 2)
     self.drop_table()
Example #3
 def test_csv(self):
     self.drop_table(True)
     self.create_table_using_datasource("row")
     sparkSession = SnappySession(self.sc)
     csvPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../test_support/kv.txt")
     sparkSession.read.csv(csvPath).write.insertInto(tableName=SnappyContextTests.tablename)
     self.drop_table()
Example #4
 def test_create_table_without_schema(self):
     self.drop_table(True)
     snappy = SnappySession(self.sc)
     # Uses the default provider (parquet); the schema is picked up from the parquet file
     parquetPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../test_support/kv.parquet")
     snappy.createTable(SnappyContextTests.tablename, path=parquetPath)
     self.verify_table_rows(3)
     self.drop_table()
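
For comparison, a hedged sketch of the same call with the provider named explicitly instead of relying on the default; the argument order and the path keyword are assumptions based on Examples #4, #10 and #11, not confirmed by this excerpt:

    # Hedged sketch: create the table naming the provider explicitly.
    # Argument order and the `path` keyword are assumptions.
    snappy.createTable(SnappyContextTests.tablename, "parquet", path=parquetPath)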
Example #5
 def insert_table(self):
     sparkSession = SnappySession(self.sc)
     newrow = ((1, 2, 3), (2, 3, 4))
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(7)
     newrow = [1, 2, 3]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(8)
Example #6
 def create_table_using_sql(self, ddl, provider):
     sparkSession = SnappySession(self.sc)
     schema = StructType().add("col1", IntegerType()).add("col2", IntegerType()).add("col3", IntegerType())
     data = SnappyContextTests.testdata
     dataDF = sparkSession.createDataFrame(data, schema)
     sparkSession.sql("DROP TABLE IF EXISTS " + SnappyContextTests.tablename)
     sparkSession.sql(ddl)
     dataDF.write.insertInto(SnappyContextTests.tablename)
Example #7
 def create_table_using_sql(self, ddl, provider):
     sparkSession = SnappySession(self.sc)
     dataDF = sparkSession._sc.parallelize(SnappyContextTests.testdata,
                                           5).toDF()
     sparkSession.sql("DROP TABLE IF EXISTS " +
                      SnappyContextTests.tablename)
     sparkSession.sql(ddl)
     dataDF.write.insertInto(SnappyContextTests.tablename)
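
Neither variant shows the `ddl` string itself. A hedged example of the kind of statement these helpers expect, using SnappyData's CREATE TABLE ... USING <provider> form, might look like the following; the exact DDL used by the test suite is not shown in this excerpt:

    # Hedged sketch of a DDL string to pass as the `ddl` argument above.
    ddl = ("CREATE TABLE " + SnappyContextTests.tablename +
           " (col1 INT, col2 INT, col3 INT) USING row")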
Example #8
 def insert_table(self):
     sparkSession = SnappySession(self.sc)
     newrow = [1, 2, 3], [2, 3, 4]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(7)
     newrow = [1, 2, 3]
     sparkSession.insert(SnappyContextTests.tablename, newrow)
     self.verify_table_rows(8)
Example #9
    def test_new_session(self):
        sqlSession1 = SnappySession(self.sc)
        sqlSession1.conf.set("test_key", "a")

        sqlSession2 = sqlSession1.newSession()
        sqlSession2.conf.set("test_key", "b")

        self.assertEqual(sqlSession1.conf.get("test_key", ""), "a")
        self.assertEqual(sqlSession2.conf.get("test_key", ""), "b")
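
Assuming SnappySession follows SparkSession.newSession() semantics, both sessions are backed by the same SparkContext and only the SQL configuration is isolated; a short hedged check continuing the test above:

    # Hedged sketch (assumption: newSession() follows SparkSession semantics):
    # the SparkContext is shared, only the SQL conf is per-session.
    assert sqlSession1._sc is sqlSession2._sc
    assert sqlSession1.conf.get("test_key", "") == "a"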
Example #10
 def create_table_using_datasource(self, provider, schemaddl=False):
     sparkSession = SnappySession(self.sc)
     schema = StructType().add("col1", IntegerType()).add("col2", IntegerType()).add("col3", IntegerType())
     data = SnappyContextTests.testdata
     df = sparkSession.createDataFrame(data, schema)
     if schemaddl is False:
         sparkSession.createTable(SnappyContextTests.tablename, provider, schema)
     else:
         sparkSession.createTable(SnappyContextTests.tablename, provider, "(COL1 INT , COL2 INT , COL3 INT)")
     df.write.format("row").mode("append").saveAsTable(SnappyContextTests.tablename)
Example #11
 def create_table_using_datasource(self, provider, schemaddl=False):
     sparkSession = SnappySession(self.sc)
     df = sparkSession._sc.parallelize(SnappyContextTests.testdata,
                                       5).toDF(["COL1", "COL2", "COL3"])
     if schemaddl is False:
         sparkSession.createTable(SnappyContextTests.tablename, provider,
                                  df.schema)
     else:
         sparkSession.createTable(SnappyContextTests.tablename, provider,
                                  "(COL1 INT , COL2 INT , COL3 INT)")
     df.write.format("row").mode("append").saveAsTable(
         SnappyContextTests.tablename)
Example #12
    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new StreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        """

        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappySession = SnappySession(sparkContext)
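
A hedged usage sketch of this constructor follows. The enclosing class name is not visible in the excerpt, so SnappyStreamingContext is assumed here (Example #13 drives such an object through self.ssc):

    # Hedged sketch: build a streaming context with one-second micro-batches
    # and reach the embedded SnappySession. The class name is an assumption.
    from pyspark import SparkContext

    sc = SparkContext("local[*]", "streaming-sketch")
    ssc = SnappyStreamingContext(sc, batchDuration=1)
    ssc._snappySession.sql("SHOW TABLES").show()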
Example #13
    def test_schema_dstream(self):
        rdd = [
            self.sc.parallelize([
                (127, -128, -32768, 32767, 2147483647, 1.0,
                 date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                 {"a": 1}, (2,), [1, 2, 3], None)
            ])
        ]
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False),
                        False),
            StructField("struct1",
                        StructType([StructField("b", ShortType(), False)]),
                        False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)
        ])

        dstream = self.ssc.queueStream(rdd)
        self.ssc.sql("drop  table if exists testTable")

        self.ssc._snappySession.createTable("testTable", "column", schema)

        schemdstream = self.ssc.createSchemaDStream(dstream, schema)

        def testFunction(df):
            df.write.format("column").mode("append").saveAsTable("testTable")

        schemdstream.foreachDataFrame(lambda df: testFunction(df))

        self.ssc.sql("select count (*)  from testTable").collect()
        self.ssc.start()
        self.ssc.awaitTermination(2)
        result = SnappySession(
            self.sc).sql("select count(*) from testTable").collect()
        self.assertEqual(result[0][0], 1)
Example #14
 def drop_table(self, ifexists=False):
     sparkSession = SnappySession(self.sc)
     sparkSession.dropTable(SnappyContextTests.tablename, ifexists)
Example #15
 def verify_table_rows(self, rowcount):
     sparkSession = SnappySession(self.sc)
     result = sparkSession.sql("SELECT COUNT(*) FROM " + SnappyContextTests.tablename).collect()
     self.assertEqual(result[0][0], rowcount)
Example #16
 def truncate_table(self):
     sparkSession = SnappySession(self.sc)
     sparkSession.truncateTable(SnappyContextTests.tablename, True)
Example #17
 def update_table(self):
     sparkSession = SnappySession(self.sc)
     modifiedrows = sparkSession.update(SnappyContextTests.tablename, "COL2 =2", [7], ["COL1"])
     self.assertEqual(modifiedrows, 3)
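
update() takes the table name, a SQL filter expression, the new column values, and the columns to set, and returns the number of modified rows. Since the last two arguments are lists, updating several columns in one call should follow the same pattern; a hedged sketch (multi-column support is an assumption, not confirmed by this excerpt):

    # Hedged sketch: set two columns in one update() call; multi-column update
    # is assumed from the list-typed arguments.
    modified = sparkSession.update(SnappyContextTests.tablename,
                                   "COL1 = 1", [20, 30], ["COL2", "COL3"])
    assert modified >= 0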
Example #18
    def __init__(self, jdstream, ssc, jrdd_deserializer, schema):
        DStream.__init__(self, jdstream, ssc, jrdd_deserializer)

        self._schema = schema
        self._snappySession = SnappySession(self._sc)
Example #19
                       False,
                       PARTITION_BY='PS_PARTKEY')

    print()
    print("Inserting data in PARTSUPP table using dataframe")
    tuples = [(100, 1, 5000, Decimal(100)), (200, 2, 50, Decimal(10)),
              (300, 3, 1000, Decimal(20)), (400, 4, 200, Decimal(30))]
    rdd = sc.parallelize(tuples)
    tuplesDF = snappy.createDataFrame(rdd, schema)
    tuplesDF.write.insertInto("PARTSUPP")
    print("Printing the contents of the PARTSUPP table")
    snappy.sql("SELECT * FROM PARTSUPP").show()

    print("Update the available quantity for PARTKEY 100")
    snappy.update("PARTSUPP", "PS_PARTKEY =100", [50000], ["PS_AVAILQTY"])
    print("Printing the contents of the PARTSUPP table after update")
    snappy.sql("SELECT * FROM PARTSUPP").show()

    print("Delete the records for PARTKEY 400")
    snappy.delete("PARTSUPP", "PS_PARTKEY =400")
    print("Printing the contents of the PARTSUPP table after delete")
    snappy.sql("SELECT * FROM PARTSUPP").show()

    print("****Done****")


if __name__ == "__main__":
    sc = SparkContext('local[*]', 'Python Example')
    snappy = SnappySession(sc)
    main(snappy)
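
Example #19 begins mid-statement; the truncated call above it is presumably the PARTSUPP table definition. The sketch below is a hedged reconstruction inferred only from the columns referenced later (PS_PARTKEY, PS_AVAILQTY) and the visible PARTITION_BY='PS_PARTKEY' option; the remaining column names, the types, and the provider are assumptions:

    # Hedged sketch of the elided PARTSUPP definition (column names, types and
    # provider are assumptions; only PS_PARTKEY/PS_AVAILQTY and PARTITION_BY
    # appear in the excerpt).
    from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType

    schema = StructType([
        StructField("PS_PARTKEY", IntegerType(), False),
        StructField("PS_SUPPKEY", IntegerType(), False),       # assumed
        StructField("PS_AVAILQTY", IntegerType(), False),
        StructField("PS_SUPPLYCOST", DecimalType(), False),    # assumed
    ])
    snappy.createTable("PARTSUPP", "row", schema,
                       False,
                       PARTITION_BY='PS_PARTKEY')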