def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = SnappyStreamingContext(sc, 0.5)

    # A function that cannot be serialized
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
# `inputd`, `outputd`, `updater` and `self` are expected to be defined in the
# enclosing scope of the test that uses this setup function.
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = SnappyStreamingContext(sc, 0.5)
    dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
    wc = dstream.updateStateByKey(updater)
    wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
    wc.checkpoint(.5)
    self.setupCalled = True
    return ssc
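# Hedged usage sketch (illustrative only, not part of the original tests): setup
# functions like the two above are normally passed to getOrCreate(), which restores
# a streaming context from checkpoint data when it exists and otherwise calls the
# setup function to build a fresh context. The checkpoint directory argument and the
# helper name are hypothetical; this assumes SnappyStreamingContext keeps PySpark's
# StreamingContext.getOrCreate(checkpointPath, setupFunc) signature.
def _example_get_or_create(checkpoint_dir):
    # Recover from checkpoint_dir if possible, else build a new context via setup()
    ssc = SnappyStreamingContext.getOrCreate(checkpoint_dir, setup)
    ssc.start()
    return ssc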
class SnappyStreamingContextTests(StreamingContextTests):

    def setUp(self):
        self.ssc = SnappyStreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jStreamingContextOption = SnappyStreamingContext._jvm.SparkContext.getActive()
            if jStreamingContextOption.nonEmpty():
                jStreamingContextOption.get().stop(False)
        except:
            pass

    def test_schema_dstream(self):
        rdd = [self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                     date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                     {"a": 1}, (2,), [1, 2, 3], None)])]
        schema = StructType([
            StructField("byte1", ByteType(), False),
            StructField("byte2", ByteType(), False),
            StructField("short1", ShortType(), False),
            StructField("short2", ShortType(), False),
            StructField("int1", IntegerType(), False),
            StructField("float1", FloatType(), False),
            StructField("date1", DateType(), False),
            StructField("time1", TimestampType(), False),
            StructField("map1", MapType(StringType(), IntegerType(), False), False),
            StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
            StructField("list1", ArrayType(ByteType(), False), False),
            StructField("null1", DoubleType(), True)])

        dstream = self.ssc.queueStream(rdd)
        self.ssc.sql("drop table if exists testTable")
        self.ssc._snappycontext.createTable("testTable", "column", schema)
        schemdstream = self.ssc.createSchemaDStream(dstream, schema)

        def testFunction(df):
            df.write.format("column").mode("append").saveAsTable("testTable")

        schemdstream.foreachDataFrame(lambda df: testFunction(df))
        self.ssc.sql("select count (*) from testTable").collect()
        self.ssc.start()
        self.ssc.awaitTermination(2)
        result = SnappyContext(self.sc).sql("select count(*) from testTable").collect()
        self.assertEqual(result[0][0], 1)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))],
                         [list(v[0]) for v in result])

    def test_get_active(self):
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(SnappyStreamingContext.getActive(), None)
    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        def setupFunc():
            ssc = SnappyStreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns the active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(SnappyStreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActiveOrCreate() calls setupFunc
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)