import os
import shutil
import tempfile
import time

from pyspark import SparkConf, SparkContext
# The import path for SnappyStreamingContext is assumed here; adjust it to
# match your SnappyData distribution
from pyspark.streaming.snappy.context import SnappyStreamingContext
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = SnappyStreamingContext(sc, 0.5)

    # A function that cannot be serialized: it closes over the driver-side
    # SparkContext, which cannot be pickled and shipped to executors
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    # inputd is captured from the enclosing test method
    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
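# A hedged sketch, not part of the original snippet: how a setup like the one
# above is typically exercised, modeled on the upstream PySpark test
# test_transform_function_serializer_failure. It assumes setup() above is in
# scope and that self.cpd / self.ssc follow the surrounding test conventions.
def test_transform_function_serializer_failure(self):
    self.cpd = tempfile.mkdtemp("test_transform_function_serializer_failure")
    self.ssc = SnappyStreamingContext.getOrCreate(self.cpd, setup)
    try:
        # Starting the context serializes process(), which closes over the
        # non-serializable SparkContext and is expected to fail
        self.ssc.start()
    except BaseException:
        import traceback
        failure = traceback.format_exc()
        self.assertTrue(
            "It appears that you are attempting to reference SparkContext" in failure)
        return
    self.fail("using SparkContext in process should fail because it's not serializable")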
@classmethod
def setUpClass(cls):
    class_name = cls.__name__
    conf = SparkConf().set("spark.default.parallelism", 1)
    cls.sc = SparkContext(appName=class_name, conf=conf)
    cls.sc.setCheckpointDir("/tmp")
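# Companion sketch (assumed; not shown in the original): unittest pairs
# setUpClass with tearDownClass, which should release the shared SparkContext
# so later test classes can create their own.
@classmethod
def tearDownClass(cls):
    cls.sc.stop()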
def test_get_or_create_and_get_active_or_create(self):
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        return sum(vs, s or 0)

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = SnappyStreamingContext(sc, 0.5)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(0.5)
        self.setupCalled = True
        return ssc

    # Verify that getOrCreate() calls setup() in absence of checkpoint files
    self.cpd = tempfile.mkdtemp("test_streaming_cps")
    self.setupCalled = False
    self.ssc = SnappyStreamingContext.getOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    self.ssc.start()

    def check_output(n):
        while not os.listdir(outputd):
            time.sleep(0.01)
        time.sleep(1)  # make sure mtime is larger than the previous one
        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])

        while True:
            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                time.sleep(0.01)
                continue
            ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                time.sleep(0.01)
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)

    # Verify that getOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = SnappyStreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(3)

    # Verify that getOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = SnappyStreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = SnappyStreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(4)

    # Verify that getActiveOrCreate() returns active context
    self.setupCalled = False
    self.assertEqual(SnappyStreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = SnappyStreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
    self.ssc.stop(True, True)
    shutil.rmtree(self.cpd)  # delete checkpoint directory
    time.sleep(1)
    self.setupCalled = False
    self.ssc = SnappyStreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    # Stop everything
    self.ssc.stop(True, True)
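# Hedged usage sketch of the pattern the test above verifies, as a driver
# program would use it. All names here (checkpoint_dir, create_context, the
# socket source on localhost:9999) are illustrative assumptions, not part of
# the test. getOrCreate() rebuilds the context from checkpoint files when they
# exist and calls the factory function only when they do not.
checkpoint_dir = "/tmp/snappy-streaming-checkpoint"

def create_context():
    sc = SparkContext(conf=SparkConf().setAppName("recoverable-driver"))
    ssc = SnappyStreamingContext(sc, 1)
    ssc.checkpoint(checkpoint_dir)  # enable checkpointing for recovery
    lines = ssc.socketTextStream("localhost", 9999)  # assumed input source
    lines.count().pprint()  # at least one output operation is required
    return ssc

ssc = SnappyStreamingContext.getOrCreate(checkpoint_dir, create_context)
ssc.start()
ssc.awaitTermination()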