Example #1
    def test_schema_dstream(self):
        # Build one batch worth of (loc, text) rows as an RDD of tuples.
        def rddList(start, end):
            return (self.sc.parallelize(range(start, end))
                    .map(lambda i: (i, "Text" + str(i))))

        # Append each micro-batch DataFrame to the column table.
        def saveFunction(df):
            df.write.format("column").mode("append").saveAsTable(
                "streamingExample")

        schema = StructType([
            StructField("loc", IntegerType()),
            StructField("text", StringType())
        ])

        # Streaming context with a 1-second batch duration.
        snsc = SnappyStreamingContext(self.sc, 1)

        # Queue three pre-built RDDs; each becomes one micro-batch.
        dstream = snsc.queueStream(
            [rddList(1, 10), rddList(10, 20), rddList(20, 30)])

        # Recreate the target column table.
        snsc._snappycontext.dropTable("streamingExample", True)
        snsc._snappycontext.createTable("streamingExample", "column", schema)

        # Attach the schema to the DStream and persist every batch.
        schemadstream = snsc.createSchemaDStream(dstream, schema)
        schemadstream.foreachDataFrame(lambda df: saveFunction(df))
        snsc.start()
        time.sleep(1)

        snsc.sql("select count(*) from streamingExample").show()
Example #2
class SnappyStreamingContextTests(StreamingContextTests):

    def setUp(self):
        self.ssc = SnappyStreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM just in case there have been any issues in the Python API
        try:
            jStreamingContextOption = SnappyStreamingContext._jvm.SparkContext.getActive()
            if jStreamingContextOption.nonEmpty():
                jStreamingContextOption.get().stop(False)
        except:
            pass

    def test_schema_dstream(self):
        rdd = [self.sc.parallelize([(127, -128, -32768, 32767, 2147483647, 1.0,
                                    date(2010, 1, 1), datetime(2010, 1, 1, 1, 1, 1),
                                    {"a": 1}, (2,), [1, 2, 3], None)])]
        schema = StructType([
          StructField("byte1", ByteType(), False),
          StructField("byte2", ByteType(), False),
          StructField("short1", ShortType(), False),
          StructField("short2", ShortType(), False),
          StructField("int1", IntegerType(), False),
          StructField("float1", FloatType(), False),
          StructField("date1", DateType(), False),
          StructField("time1", TimestampType(), False),
          StructField("map1", MapType(StringType(), IntegerType(), False), False),
          StructField("struct1", StructType([StructField("b", ShortType(), False)]), False),
          StructField("list1", ArrayType(ByteType(), False), False),
          StructField("null1", DoubleType(), True)])


        dstream = self.ssc.queueStream(rdd)
        self.ssc.sql("drop  table if exists testTable")

        self.ssc._snappycontext.createTable("testTable", "column", schema)

        schemadstream = self.ssc.createSchemaDStream(dstream, schema)

        def testFunction(df):
            df.write.format("column").mode("append").saveAsTable("testTable")

        schemadstream.foreachDataFrame(lambda df: testFunction(df))

        self.ssc.sql("select count (*)  from testTable").collect()
        self.ssc.start()
        self.ssc.awaitTermination(2)
        result = SnappyContext(self.sc).sql("select count(*) from testTable").collect()
        self.assertEqual(result[0][0], 1)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
                lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_get_active(self):
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None once the active context is stopped
        self.ssc.stop(False)
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(SnappyStreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(SnappyStreamingContext.getActive(), None)

        def setupFunc():
            ssc = SnappyStreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns the active context and does not call setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(SnappyStreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc if the Java context is stopped
        self.ssc = SnappyStreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(SnappyStreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = SnappyStreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)
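
test_text_file_stream and test_binary_records_stream call _collect and wait_for, which are not defined here; they are inherited from the StreamingContextTests base class taken from PySpark's streaming test suite. The sketch below shows roughly what those helpers do, assuming they mirror the usual PySpark implementation; the timeout value and polling interval are assumptions, not the verbatim code.

# Rough sketch of the inherited test helpers used above (assumed to mirror
# PySpark's streaming test base class; not the verbatim implementation).
import time


class CollectHelpersSketch(object):

    timeout = 20  # assumed number of seconds to wait for batches

    def _collect(self, dstream, n, block=True):
        # Gather the output of the first n non-empty batches of a DStream.
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)
        if not block:
            # Caller starts the StreamingContext and calls wait_for() itself.
            return result
        self.ssc.start()
        self.wait_for(result, n)
        return result

    def wait_for(self, result, n):
        # Poll until n batches have arrived or the timeout expires.
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)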