Exemple #1
0
    def test_schema_dstream(self):
        """Append three queued batches to a column table through a schema DStream.

        Builds three RDDs of ``(i, "Text<i>")`` rows, queues them on a
        SnappyStreamingContext created with a batch duration of 1, recreates the
        ``streamingExample`` column table, and registers a per-DataFrame
        callback that appends each batch to that table. After starting the
        context the test sleeps for one second and prints the table's row count.
        """
        def make_rows(lo, hi):
            # One row per integer in [lo, hi): the number and its text label.
            numbers = self.sc.parallelize(range(lo, hi))
            return numbers.map(lambda n: (n, "Text" + str(n)))

        def append_to_table(frame):
            # Append the batch's DataFrame to the column table.
            writer = frame.write.format("column").mode("append")
            writer.saveAsTable("streamingExample")

        fields = [
            StructField("loc", IntegerType()),
            StructField("text", StringType()),
        ]
        schema = StructType(fields)

        snsc = SnappyStreamingContext(self.sc, 1)

        bounds = ((1, 10), (10, 20), (20, 30))
        batches = [make_rows(lo, hi) for lo, hi in bounds]
        dstream = snsc.queueStream(batches)

        # Start from a fresh table so earlier runs cannot affect the count.
        snappy = snsc._snappycontext
        snappy.dropTable("streamingExample", True)
        snappy.createTable("streamingExample", "column", schema)

        typed_stream = snsc.createSchemaDStream(dstream, schema)
        typed_stream.foreachDataFrame(append_to_table)
        snsc.start()
        time.sleep(1)

        count_frame = snsc.sql("select count(*) from streamingExample")
        count_frame.show()
Exemple #2
0
 def setup():
     """Build the streaming context used by the enclosing checkpoint test.

     Creates a SparkContext with ``spark.default.parallelism`` set to 1 and a
     SnappyStreamingContext with a batch duration of 0.5. Each line read from
     ``inputd`` is paired with 1, folded through ``updateStateByKey(updater)``,
     formatted as ``"key,value"`` text and saved under ``outputd + "test"``.
     The stateful stream is checkpointed with an interval of 0.5, and
     ``self.setupCalled`` is set so the caller can tell this function ran.

     Returns the newly created streaming context.
     """
     spark_conf = SparkConf().set("spark.default.parallelism", 1)
     spark_ctx = SparkContext(conf=spark_conf)
     ssc = SnappyStreamingContext(spark_ctx, 0.5)

     lines = ssc.textFileStream(inputd)
     pairs = lines.map(lambda line: (line, 1))
     state = pairs.updateStateByKey(updater)

     formatted = state.map(lambda kv: "%s,%d" % kv)
     formatted.saveAsTextFiles(outputd + "test")
     state.checkpoint(.5)

     # Lets the enclosing test distinguish "created" from "recovered".
     self.setupCalled = True
     return ssc
Exemple #3
0
        def setup():
            """Build a streaming context whose foreachRDD callback closes over the SparkContext.

            Creates a SparkContext with ``spark.default.parallelism`` set to 1
            and a SnappyStreamingContext with a batch duration of 0.5, then
            registers ``process`` on the text-file stream read from ``inputd``.

            Returns the newly created streaming context.
            """
            spark_conf = SparkConf().set("spark.default.parallelism", 1)
            spark_ctx = SparkContext(conf=spark_conf)
            ssc = SnappyStreamingContext(spark_ctx, 0.5)

            # Deliberately not serializable: the callback captures the
            # SparkContext from the enclosing scope.
            def process(time, rdd):
                spark_ctx.parallelize(range(1, 10))

            file_stream = ssc.textFileStream(inputd)
            file_stream.foreachRDD(process)
            return ssc
Exemple #4
0
 def test_text_file_stream(self):
     """Check that textFileStream picks up new files as separate batches.

     Two files, each holding the integers 0..9 one per line, are written
     into a fresh temporary directory one second apart while the streaming
     context is running. The collected result must contain two batches,
     each equal to ``list(range(10))``.
     """
     import shutil

     d = tempfile.mkdtemp()
     # mkdtemp() never deletes what it creates, so register removal here.
     # unittest runs cleanups after tearDown(); presumably the streaming
     # context is stopped there before the directory goes away (verify
     # against the enclosing class). ignore_errors keeps this best-effort.
     self.addCleanup(shutil.rmtree, d, ignore_errors=True)

     self.ssc = SnappyStreamingContext(self.sc, self.duration)
     dstream2 = self.ssc.textFileStream(d).map(int)
     result = self._collect(dstream2, 2, block=False)
     self.ssc.start()
     for name in ('a', 'b'):
         # Space the files out so they are written at different times.
         time.sleep(1)
         with open(os.path.join(d, name), "w") as f:
             f.writelines(["%d\n" % i for i in range(10)])
     self.wait_for(result, 2)
     self.assertEqual([list(range(10)), list(range(10))], result)
Exemple #5
0
 def test_binary_records_stream(self):
     """Check that binaryRecordsStream reads fixed-length records from new files.

     Two files, each holding the ten bytes 0..9, are written into a fresh
     temporary directory one second apart while the streaming context is
     running. The stream is created with a record length of 10 and each
     record is unpacked with the ``"10b"`` struct format; the first record of
     each of the two collected batches must equal ``list(range(10))``.
     """
     import shutil

     d = tempfile.mkdtemp()
     # mkdtemp() never deletes what it creates, so register removal here.
     # unittest runs cleanups after tearDown(); presumably the streaming
     # context is stopped there before the directory goes away (verify
     # against the enclosing class). ignore_errors keeps this best-effort.
     self.addCleanup(shutil.rmtree, d, ignore_errors=True)

     self.ssc = SnappyStreamingContext(self.sc, self.duration)
     dstream = self.ssc.binaryRecordsStream(d, 10).map(
             lambda v: struct.unpack("10b", bytes(v)))
     result = self._collect(dstream, 2, block=False)
     self.ssc.start()
     for name in ('a', 'b'):
         # Space the files out so they are written at different times.
         time.sleep(1)
         with open(os.path.join(d, name), "wb") as f:
             f.write(bytearray(range(10)))
     self.wait_for(result, 2)
     # v[0] is the first unpacked record (a tuple) of each collected batch.
     self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])
Exemple #6
0
    def test_get_active(self):
        """Check SnappyStreamingContext.getActive() across the context lifecycle.

        getActive() must return None before any context is started, the
        started context while it is running, and None again after the context
        is stopped, whether through the Python ``stop()`` or directly on the
        underlying ``_jssc`` object.
        """
        self.assertIsNone(SnappyStreamingContext.getActive())

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertIsNone(SnappyStreamingContext.getActive())

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        # Bypass the Python wrapper and stop the underlying context directly.
        self.ssc._jssc.stop(False)
        self.assertIsNone(SnappyStreamingContext.getActive())
Exemple #7
0
    def test_get_active_or_create(self):
        """Check getActiveOrCreate() without checkpoint data.

        ``setupFunc`` must be called when no context is active (initially,
        after ``stop()``, and after the underlying ``_jssc`` is stopped
        directly) and must not be called while a started context is active,
        in which case that active context is returned instead.
        """
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertIsNone(SnappyStreamingContext.getActive())

        def setupFunc():
            # Creates a fresh context and records that it was invoked.
            ssc = SnappyStreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(
            SnappyStreamingContext.getActiveOrCreate(None, setupFunc),
            self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped directly, then
        # getActiveOrCreate() calls setupFunc again
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)
Exemple #8
0
 def setUp(self):
      """Create a streaming context on ``self.sc`` and store it as ``self.ssc``."""
      streaming_context = SnappyStreamingContext(self.sc, self.duration)
      self.ssc = streaming_context
Exemple #9
0
 def setupFunc():
     """Create a new streaming context and record that this factory ran.

     Registers a counting foreachRDD callback on a one-element queue stream
     of the new context, sets ``self.setupCalled`` to True, and returns the
     context.
     """
     new_context = SnappyStreamingContext(self.sc, self.duration)
     queued = new_context.queueStream([[1]])
     queued.foreachRDD(lambda batch: batch.count())
     self.setupCalled = True
     return new_context