def test_schema_dstream(self):
    """Push three queued RDD batches through a SchemaDStream and append each
    batch DataFrame into a SnappyData column table.

    Builds (int, str) rows, wraps them in a schema'd DStream, registers a
    per-DataFrame save into table ``streamingExample``, then starts the
    streaming context and prints the row count.
    """

    def rddList(start, end):
        # One batch of (loc, text) tuples in [start, end).
        return self.sc.parallelize(range(start, end)).map(
            lambda i: (i, "Text" + str(i)))

    def saveFunction(df):
        # Append every micro-batch DataFrame into the column table.
        df.write.format("column").mode("append").saveAsTable(
            "streamingExample")

    schema = StructType([
        StructField("loc", IntegerType()),
        StructField("text", StringType())
    ])

    snsc = SnappyStreamingContext(self.sc, 1)
    dstream = snsc.queueStream(
        [rddList(1, 10), rddList(10, 20), rddList(20, 30)])

    # Recreate the target table from a clean slate (ignoreIfExists-style drop).
    snsc._snappycontext.dropTable("streamingExample", True)
    snsc._snappycontext.createTable("streamingExample", "column", schema)

    schemadstream = snsc.createSchemaDStream(dstream, schema)
    # Pass the function directly; wrapping it in `lambda df: saveFunction(df)`
    # added a pointless indirection.
    schemadstream.foreachDataFrame(saveFunction)

    snsc.start()
    # NOTE(review): a fixed 1s sleep is timing-sensitive and there is no
    # assertion on the count — this only eyeballs output via show().
    time.sleep(1)
    snsc.sql("select count(*) from streamingExample").show()
def test_schema_dstream(self):
    """Stream three queued batches of (loc, text) rows into the SnappyData
    column table ``streamingExample`` via a SchemaDStream."""

    def make_batch(lo, hi):
        # Build one RDD batch of (int, "Text<int>") pairs for [lo, hi).
        pairs = self.sc.parallelize(range(lo, hi))
        return pairs.map(lambda n: (n, "Text" + str(n)))

    def persist_batch(frame):
        # Append one micro-batch DataFrame to the column table.
        writer = frame.write.format("column").mode("append")
        writer.saveAsTable("streamingExample")

    table_schema = StructType(
        [StructField("loc", IntegerType()),
         StructField("text", StringType())])

    snsc = SnappyStreamingContext(self.sc, 1)
    batches = [make_batch(1, 10), make_batch(10, 20), make_batch(20, 30)]
    dstream = snsc.queueStream(batches)

    # Drop any leftover table, then create it fresh with the schema above.
    snsc._snappycontext.dropTable("streamingExample", True)
    snsc._snappycontext.createTable("streamingExample", "column", table_schema)

    typed_stream = snsc.createSchemaDStream(dstream, table_schema)
    typed_stream.foreachDataFrame(lambda frame: persist_batch(frame))

    snsc.start()
    time.sleep(1)
    snsc.sql("select count(*) from streamingExample").show()
def setupFunc():
    """Build a fresh SnappyStreamingContext with one trivial registered
    output (a queue stream that just counts its RDDs) and record that
    this factory actually ran via ``self.setupCalled``."""
    context = SnappyStreamingContext(self.sc, self.duration)
    trivial = context.queueStream([[1]])
    trivial.foreachRDD(lambda rdd: rdd.count())
    self.setupCalled = True
    return context
class SnappyStreamingContextTests(StreamingContextTests):
    """Streaming tests run against SnappyStreamingContext instead of the
    plain StreamingContext, covering schema DStreams, file/binary streams,
    and the getActive()/getActiveOrCreate() lifecycle."""

    def setUp(self):
        # Fresh context per test; self.sc / self.duration come from the base class.
        self.ssc = SnappyStreamingContext(self.sc, self.duration)

    def tearDown(self):
        """Stop the Python-side context, then best-effort stop any context
        still active on the JVM side."""
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jStreamingContextOption = SnappyStreamingContext._jvm.SparkContext.getActive()
            if jStreamingContextOption.nonEmpty():
                jStreamingContextOption.get().stop(False)
        except Exception:
            # Deliberate best-effort cleanup; a bare `except:` here would also
            # swallow KeyboardInterrupt/SystemExit, so catch Exception only.
            pass

    def test_schema_dstream(self):
        """Round-trip one row of mixed types through a SchemaDStream into a
        column table and assert exactly one row landed."""
        rdd = [self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                     date(2010, 1, 1),
                                     datetime(2010, 1, 1, 1, 1, 1),
                                     {"a": 1}, (2,), [1, 2, 3], None)])]
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False), False),
            StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)])

        dstream = self.ssc.queueStream(rdd)
        self.ssc.sql("drop table if exists testTable")
        self.ssc._snappycontext.createTable("testTable", "column", schema)
        schemadstream = self.ssc.createSchemaDStream(dstream, schema)

        def testFunction(df):
            # Append each micro-batch DataFrame into the column table.
            df.write.format("column").mode("append").saveAsTable("testTable")

        # Pass the function directly — the old `lambda df: testFunction(df)`
        # wrapper was redundant.
        schemadstream.foreachDataFrame(testFunction)
        # NOTE(review): this pre-start count query discards its result; kept
        # as-is (presumably warms the table/plan) — confirm it is needed.
        self.ssc.sql("select count (*) from testTable").collect()
        self.ssc.start()
        self.ssc.awaitTermination(2)
        result = SnappyContext(self.sc).sql(
            "select count(*) from testTable").collect()
        self.assertEqual(result[0][0], 1)

    def test_text_file_stream(self):
        """Two text files dropped into a watched directory each yield one
        batch of ints 0..9."""
        d = tempfile.mkdtemp()
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            # Sleep so each file lands in a distinct batch interval.
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        """Two 10-byte binary files are split into fixed-size records and
        unpacked back into the original byte values."""
        d = tempfile.mkdtemp()
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))],
                         [list(v[0]) for v in result])

    def test_get_active(self):
        """getActive() tracks start/stop on both the Python and Java sides."""
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(SnappyStreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        """getActiveOrCreate() reuses an active context and only invokes the
        setup factory when none is active (Python- or Java-side stopped)."""
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        def setupFunc():
            # Factory handed to getActiveOrCreate(); flags that it ran.
            ssc = SnappyStreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc
        # when no context is active
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not
        # call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(
            SnappyStreamingContext.getActiveOrCreate(None, setupFunc),
            self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context
        # is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)