def _writeAndVerify(self, ports):
    """Open a Flume polling stream on `ports`, push test events through the
    Java-side test utilities, and assert all headers/bodies arrive.

    Note: `sendDatAndEnsureAllDataHasBeenReceived` is the exact name exposed
    by the Java test helper (upstream spelling) — do not "fix" it here.
    """
    # Set up the streaming context and input streams
    ssc = StreamingContext(self.sc, self.duration)
    try:
        addresses = [("localhost", port) for port in ports]
        dstream = FlumeUtils.createPollingStream(
            ssc,
            addresses,
            maxBatchSize=self._utils.eventsPerBatch(),
            parallelism=5)
        # Collected (header, body) events from every batch.
        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        dstream.foreachRDD(get_output)
        ssc.start()
        # Send the data only after the context is started so the polling
        # receiver is connected before events are emitted.
        self._utils.sendDatAndEnsureAllDataHasBeenReceived()
        self.wait_for(outputBuffer, self._utils.getTotalEvents())
        # Each event is a (headers, body) pair; verify both halves.
        outputHeaders = [event[0] for event in outputBuffer]
        outputBodies = [event[1] for event in outputBuffer]
        self._utils.assertOutput(outputHeaders, outputBodies)
    finally:
        # Stop streaming only; keep the shared SparkContext alive for
        # other tests (stopSparkContext=False).
        ssc.stop(False)
def setUp(self):
    """Create a fresh SparkContext/StreamingContext pair for each test."""
    spark_conf = SparkConf().set("spark.default.parallelism", 1)
    app_name = type(self).__name__
    self.sc = SparkContext(appName=app_name, conf=spark_conf)
    self.sc.setCheckpointDir("/tmp")
    # TODO: decrease duration to speed up tests
    self.ssc = StreamingContext(self.sc, self.duration)
def test_from_no_conf_constructor(self):
    """master/appName passed to the constructor must land in the conf."""
    self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                duration=self.batachDuration)
    # Could also read ssc.sparkContext.master directly, but going through
    # the conf mirrors the Scala-side test.
    underlying_conf = self.ssc.sparkContext._conf
    self.assertEqual(underlying_conf.get("spark.master"), self.master)
    self.assertEqual(underlying_conf.get("spark.app.name"), self.appName)
def test_stop_multiple_times(self):
    """Stopping an already-stopped context a second time must not raise."""
    self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                duration=self.batachDuration)
    self._addInputStream(self.ssc)
    self.ssc.start()
    for _ in range(2):
        self.ssc.stop()
def test_stop_only_streaming_context(self):
    """stop(False) must leave the underlying SparkContext usable."""
    self.sc = SparkContext(master=self.master, appName=self.appName)
    self.ssc = StreamingContext(sparkContext=self.sc,
                                duration=self.batachDuration)
    self._addInputStream(self.ssc)
    self.ssc.start()
    self.ssc.stop(False)
    # A batch job on the surviving SparkContext should still work.
    partitions = self.sc.parallelize(range(5), 5).glom().collect()
    self.assertEqual(len(partitions), 5)
def test_from_conf_with_settings(self):
    """Custom settings on a SparkConf must survive into the streaming context."""
    conf = (SparkConf()
            .set("spark.cleaner.ttl", "10")
            .setMaster(self.master)
            .setAppName(self.appName))
    self.ssc = StreamingContext(conf=conf, duration=self.batachDuration)
    ttl = self.ssc.sparkContext._conf.get("spark.cleaner.ttl")
    self.assertEqual(int(ttl), 10)
def test_slice(self):
    """Basic operation test for DStream.slice."""
    import datetime as dt
    # 1-second batches; remember() keeps RDDs alive long enough to slice.
    self.ssc = StreamingContext(self.sc, 1.0)
    self.ssc.remember(4.0)
    input = [[1], [2], [3], [4]]
    stream = self.ssc.queueStream(
        [self.sc.parallelize(d, 1) for d in input])
    # Batch times recorded as each RDD is processed.
    time_vals = []

    def get_times(t, rdd):
        if rdd and len(time_vals) < len(input):
            time_vals.append(t)

    stream.foreachRDD(get_times)
    self.ssc.start()
    self.wait_for(time_vals, 4)
    begin_time = time_vals[0]

    def get_sliced(begin_delta, end_delta):
        # Slice [begin, end] relative to the first batch time and
        # flatten all the returned RDDs' elements into one list.
        begin = begin_time + dt.timedelta(seconds=begin_delta)
        end = begin_time + dt.timedelta(seconds=end_delta)
        rdds = stream.slice(begin, end)
        result_list = [rdd.collect() for rdd in rdds]
        return [r for result in result_list for r in result]

    self.assertEqual(set([1]), set(get_sliced(0, 0)))
    self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
    self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
    self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))
def setUp(self):
    """Write a small keyed dataset into Cassandra, verify it landed, and
    prepare a queue stream over the same RDDs for the delete tests.

    Assumes `self.interval`, `self.size`, `self.keyspace`, `self.table`
    and the `rdd()` helper are provided by the base class — defined
    outside this view.
    """
    super(DeleteFromCassandraStreamingTest, self).setUp()
    self.ssc = StreamingContext(self.sc, self.interval)
    # One RDD of {key, int, text} rows, keyed 0..size-1.
    self.rdds = [
        self.sc.parallelize(range(0, self.size)).map(lambda i: {
            'key': i,
            'int': i,
            'text': i
        })
    ]
    data = self.rdds[0]
    data.saveToCassandra(self.keyspace, self.table)
    # verify the RDD length and actual content
    data = self.rdd()
    self.assertEqual(len(data.collect()), self.size)
    # verify we have actually data for `text` and `int`
    row = data.select('text', 'int').where('key=?', '0').first()
    self.assertEqual(row.text, u'0')
    self.assertEqual(row.int, 0)
    # stream we will use in tests.
    self.stream = self.ssc.queueStream(self.rdds)
def test_get_or_create(self):
    """Checkpoint round-trip: a stateful word count survives a restart via
    StreamingContext.getOrCreate on the checkpoint directory.
    """
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        # Running sum of counts; state starts at 0.
        return sum(vs, s or 0)

    def setup():
        # Fresh context used only when no checkpoint exists yet.
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 0.5)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(.5)
        return ssc

    cpd = tempfile.mkdtemp("test_streaming_cps")
    ssc = StreamingContext.getOrCreate(cpd, setup)
    ssc.start()

    def check_output(n):
        # Wait for the first output dir, then drop input file `n`
        # (10 identical lines) and poll until its counts show up.
        while not os.listdir(outputd):
            time.sleep(0.01)
        time.sleep(1)  # make sure mtime is larger than the previous one
        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])
        while True:
            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                time.sleep(0.01)
                continue
            ordd = ssc.sparkContext.textFile(p).map(
                lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                time.sleep(0.01)
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                # Output still reflects an earlier batch; keep polling.
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)
    ssc.stop(True, True)

    # Restart from the checkpoint; state must continue at 3.
    time.sleep(1)
    ssc = StreamingContext.getOrCreate(cpd, setup)
    ssc.start()
    check_output(3)
    ssc.stop(True, True)
def test_get_or_create(self):
    """Checkpoint round-trip: a stateful word count survives a restart via
    StreamingContext.getOrCreate on the checkpoint directory.
    """
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        # Running sum of counts; state starts at 0.
        return sum(vs, s or 0)

    def setup():
        # Fresh context used only when no checkpoint exists yet.
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 0.5)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(0.5)
        return ssc

    cpd = tempfile.mkdtemp("test_streaming_cps")
    ssc = StreamingContext.getOrCreate(cpd, setup)
    ssc.start()

    def check_output(n):
        # Wait for the first output dir, then drop input file `n`
        # (10 identical lines) and poll until its counts show up.
        while not os.listdir(outputd):
            time.sleep(0.01)
        time.sleep(1)  # make sure mtime is larger than the previous one
        with open(os.path.join(inputd, str(n)), "w") as f:
            f.writelines(["%d\n" % i for i in range(10)])
        while True:
            p = os.path.join(outputd, max(os.listdir(outputd)))
            if "_SUCCESS" not in os.listdir(p):
                # not finished
                time.sleep(0.01)
                continue
            ordd = ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                time.sleep(0.01)
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                # Output still reflects an earlier batch; keep polling.
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)
    ssc.stop(True, True)

    # Restart from the checkpoint; state must continue at 3.
    time.sleep(1)
    ssc = StreamingContext.getOrCreate(cpd, setup)
    ssc.start()
    check_output(3)
    ssc.stop(True, True)
def setup():
    """Build a checkpointed stateful word-count pipeline over `inputd`."""
    spark_conf = SparkConf().set("spark.default.parallelism", 1)
    context = SparkContext(conf=spark_conf)
    streaming = StreamingContext(context, 0.5)
    pairs = streaming.textFileStream(inputd).map(lambda word: (word, 1))
    counts = pairs.updateStateByKey(updater)
    counts.map(lambda kv: "%s,%d" % kv).saveAsTextFiles(outputd + "test")
    counts.checkpoint(0.5)
    return streaming
def setup():
    """Factory for getOrCreate: checkpointed word count over `inputd`."""
    sc = SparkContext(conf=SparkConf().set("spark.default.parallelism", 1))
    ssc = StreamingContext(sc, 0.5)
    # (word, 1) pairs from each new text file, folded into running counts.
    word_counts = (ssc.textFileStream(inputd)
                   .map(lambda w: (w, 1))
                   .updateStateByKey(updater))
    word_counts.map(lambda pair: "%s,%d" % pair).saveAsTextFiles(outputd + "test")
    word_counts.checkpoint(0.5)
    return ssc
def setup():
    """Factory that deliberately builds an UNserializable pipeline: the
    foreachRDD callback closes over the SparkContext, which cannot be
    pickled for checkpointing. Used to test serializer-failure handling.
    """
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)

    # A function that cannot be serialized
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
def test_text_file_stream(self):
    """Files dropped into a watched directory arrive as batches of ints."""
    watch_dir = tempfile.mkdtemp()
    self.ssc = StreamingContext(self.sc, self.duration)
    ints = self.ssc.textFileStream(watch_dir).map(int)
    collected = self._collect(ints, 2, block=False)
    self.ssc.start()
    for fname in ('a', 'b'):
        # Sleep so each file gets a fresh mtime and lands in its own batch.
        time.sleep(1)
        with open(os.path.join(watch_dir, fname), "w") as handle:
            handle.writelines("%d\n" % i for i in range(10))
    self.wait_for(collected, 2)
    self.assertEqual([list(range(10)), list(range(10))], collected)
def test_binary_records_stream(self):
    """Fixed-length binary files are split into 10-byte records."""
    record_dir = tempfile.mkdtemp()
    self.ssc = StreamingContext(self.sc, self.duration)
    records = self.ssc.binaryRecordsStream(record_dir, 10).map(
        lambda payload: struct.unpack("10b", bytes(payload)))
    collected = self._collect(records, 2, block=False)
    self.ssc.start()
    for fname in ('a', 'b'):
        # Sleep so each file gets a fresh mtime and lands in its own batch.
        time.sleep(1)
        with open(os.path.join(record_dir, fname), "wb") as handle:
            handle.write(bytearray(range(10)))
    self.wait_for(collected, 2)
    self.assertEqual([list(range(10)), list(range(10))],
                     [list(batch[0]) for batch in collected])
def test_transform_function_serializer_failure(self):
    """start() must fail when a transform function closes over the
    (unpicklable) SparkContext, since it cannot be checkpointed.
    """
    inputd = tempfile.mkdtemp()
    self.cpd = tempfile.mkdtemp(
        "test_transform_function_serializer_failure")

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 0.5)

        # A function that cannot be serialized: it captures `sc`.
        def process(time, rdd):
            sc.parallelize(range(1, 10))

        ssc.textFileStream(inputd).foreachRDD(process)
        return ssc

    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    try:
        self.ssc.start()
    # Was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; the serialization failure is an Exception.
    except Exception:
        import traceback
        failure = traceback.format_exc()
        self.assertTrue(
            "It appears that you are attempting to reference SparkContext"
            in failure)
        return

    self.fail(
        "using SparkContext in process should fail because it's not Serializable"
    )
def test_slice(self):
    """Basic operation test for DStream.slice."""
    import datetime as dt
    # 1-second batches; remember() keeps RDDs alive long enough to slice.
    self.ssc = StreamingContext(self.sc, 1.0)
    self.ssc.remember(4.0)
    input = [[1], [2], [3], [4]]
    stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])
    # Batch times recorded as each RDD is processed.
    time_vals = []

    def get_times(t, rdd):
        if rdd and len(time_vals) < len(input):
            time_vals.append(t)

    stream.foreachRDD(get_times)
    self.ssc.start()
    self.wait_for(time_vals, 4)
    begin_time = time_vals[0]

    def get_sliced(begin_delta, end_delta):
        # Slice [begin, end] relative to the first batch time and
        # flatten all the returned RDDs' elements into one list.
        begin = begin_time + dt.timedelta(seconds=begin_delta)
        end = begin_time + dt.timedelta(seconds=end_delta)
        rdds = stream.slice(begin, end)
        result_list = [rdd.collect() for rdd in rdds]
        return [r for result in result_list for r in result]

    self.assertEqual(set([1]), set(get_sliced(0, 0)))
    self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
    self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
    self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))
def test_transform_function_serializer_failure(self):
    """start() must fail when a transform function closes over the
    (unpicklable) SparkContext, since it cannot be checkpointed.
    """
    inputd = tempfile.mkdtemp()
    self.cpd = tempfile.mkdtemp("test_transform_function_serializer_failure")

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 0.5)

        # A function that cannot be serialized: it captures `sc`.
        def process(time, rdd):
            sc.parallelize(range(1, 10))

        ssc.textFileStream(inputd).foreachRDD(process)
        return ssc

    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    try:
        self.ssc.start()
    # Was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; the serialization failure is an Exception.
    except Exception:
        import traceback
        failure = traceback.format_exc()
        self.assertTrue(
            "It appears that you are attempting to reference SparkContext" in failure)
        return

    self.fail("using SparkContext in process should fail because it's not Serializable")
def main(argv):
    """Run Flajolet-Martin distinct-count estimation over a socket stream.

    argv[0] -- port of the local socket text stream
    argv[1] -- path of the CSV output file

    Fixes: removed dead `port`/`out_file` defaults that were immediately
    overwritten, and the redundant `fout.close()` calls inside `with`
    blocks; the truncate + header write are now a single open.
    """
    port = int(argv[0])
    out_file = argv[1]

    # Truncate any previous output and write the CSV header.
    with open(out_file, "w") as fout:
        csv.writer(fout).writerow(["Time", "Ground Truth", "Estimation"])

    conf = SparkConf().setMaster("local[*]") \
        .setAppName("Flajolet-Martin") \
        .set("spark.executor.memory", "4g") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    ssc = StreamingContext(sc, BATCH_DURATION)

    # Windowed JSON records from the socket stream.
    stream = ssc.socketTextStream("localhost", port) \
        .window(WINDOW_LENGTH, SLIDING_INTERVAL) \
        .map(lambda x: json.loads(x))

    hashParams = hashFuncs()
    # Estimate distinct cities per window, appending one CSV row each batch.
    stream.map(lambda x: x["city"]).filter(lambda x: x != "") \
        .foreachRDD(lambda rdd: Flajolet_Martin(rdd, hashParams, out_file))

    ssc.start()
    ssc.awaitTermination()
def createSSC():
    """Create a checkpointed word-count StreamingContext over a local socket."""
    # Build the streaming context (3-second batches).
    sc = SparkContext(master="local[*]", appName="CheckpointSample",
                      conf=SparkConf())
    ssc = StreamingContext(sc, 3)
    # (word, 1) pairs from the socket text stream.
    pairs = ssc.socketTextStream("127.0.0.1", 9000) \
        .flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1))
    # Running totals via updateStateByKey, printed each batch.
    pairs.updateStateByKey(updateFunc).pprint()
    # Checkpointing is required by updateStateByKey.
    ssc.checkpoint("./checkPoints/checkPointSample/Python")
    return ssc
def _create_spark_context(self, spark_config, stream, stream_duration):
    """Initialize Spark entry points.

    spark_config -- mapping of Spark option name -> value
    stream -- when exactly True, also create a StreamingContext with
        batch interval `stream_duration`; otherwise build a Hive-enabled
        SparkSession only.

    Fix: the original did `SparkContext(conf=...).getOrCreate()`, which
    instantiates a brand-new context (raising if one is already active)
    and only then calls the classmethod through the instance. Use the
    classmethod directly so an existing context is reused.
    """
    conf = pyspark.SparkConf().setAll(spark_config.items())
    if stream is True:  # deliberately strict: only the boolean True enables streaming
        self.streaming_context = StreamingContext(
            SparkContext.getOrCreate(conf=conf), stream_duration)
        self.spark = SparkSession(self.streaming_context.sparkContext)
    else:
        self.spark = SparkSession.builder \
            .config(conf=conf) \
            .enableHiveSupport().getOrCreate()
def test_binary_records_stream(self):
    """Binary files written to a watched dir arrive as 10-byte records."""
    out_dir = tempfile.mkdtemp()
    self.ssc = StreamingContext(self.sc, self.duration)
    unpack_record = lambda v: struct.unpack("10b", bytes(v))
    stream = self.ssc.binaryRecordsStream(out_dir, 10).map(unpack_record)
    result = self._collect(stream, 2, block=False)
    self.ssc.start()
    for name in ("a", "b"):
        # Sleep so each file lands in a distinct batch.
        time.sleep(1)
        target = os.path.join(out_dir, name)
        with open(target, "wb") as f:
            f.write(bytearray(range(10)))
    self.wait_for(result, 2)
    unpacked = [list(v[0]) for v in result]
    self.assertEqual([list(range(10)), list(range(10))], unpacked)
def test_get_active_or_create(self):
    # Test StreamingContext.getActiveOrCreate() without checkpoint data
    # See CheckpointTests for tests with checkpoint data
    self.ssc = None
    self.assertEqual(StreamingContext.getActive(), None)

    def setupFunc():
        # Factory whose invocation is tracked via self.setupCalled.
        ssc = StreamingContext(self.sc, self.duration)
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.setupCalled = True
        return ssc

    # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
    self.ssc.start()
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
    self.ssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that if the Java context is stopped, then getActive() returns None
    self.ssc = StreamingContext(self.sc, self.duration)
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(StreamingContext.getActive(), self.ssc)
    # Stop only the JVM-side context, bypassing the Python wrapper.
    self.ssc._jssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)
def test_get_active(self): self.assertEqual(StreamingContext.getActive(), None) # Verify that getActive() returns the active context self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) self.ssc.start() self.assertEqual(StreamingContext.getActive(), self.ssc) # Verify that getActive() returns None self.ssc.stop(False) self.assertEqual(StreamingContext.getActive(), None) # Verify that if the Java context is stopped, then getActive() returns None self.ssc = StreamingContext(self.sc, self.duration) self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count()) self.ssc.start() self.assertEqual(StreamingContext.getActive(), self.ssc) self.ssc._jssc.stop(False) self.assertEqual(StreamingContext.getActive(), None)
def test_get_active_or_create(self):
    # Test StreamingContext.getActiveOrCreate() without checkpoint data
    # See CheckpointTests for tests with checkpoint data
    self.ssc = None
    self.assertEqual(StreamingContext.getActive(), None)

    def setupFunc():
        # Factory whose invocation is tracked via self.setupCalled.
        ssc = StreamingContext(self.sc, self.duration)
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.setupCalled = True
        return ssc

    # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
    self.ssc.start()
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
    self.ssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)

    # Verify that if the Java context is stopped, then getActive() returns None
    self.ssc = StreamingContext(self.sc, self.duration)
    self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.ssc.start()
    self.assertEqual(StreamingContext.getActive(), self.ssc)
    # Stop only the JVM-side context, bypassing the Python wrapper.
    self.ssc._jssc.stop(False)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)
from pyspark.streaming.context import StreamingContext


def updateFunc(newValues, currentValue):
    """State updater: add this batch's counts onto the running total."""
    if currentValue is None:
        currentValue = 0
    return sum(newValues, currentValue)


def createSSC():
    """Create a checkpointed word-count StreamingContext over a local socket."""
    # Build the streaming context (3-second batches).
    conf = SparkConf()
    sc = SparkContext(master="local[*]", appName="CheckpointSample", conf=conf)
    ssc = StreamingContext(sc, 3)
    # DStream of (word, 1) pairs from the socket text stream.
    ids1 = ssc.socketTextStream("127.0.0.1", 9000)
    ids2 = ids1.flatMap(lambda v: v.split(" ")).map(lambda v: (v, 1))
    # Maintain running word counts across batches.
    ids2.updateStateByKey(updateFunc).pprint()
    # Checkpointing is required by updateStateByKey and getOrCreate below.
    ssc.checkpoint("./checkPoints/checkPointSample/Python")
    return ssc


ssc = StreamingContext.getOrCreate("./checkPoints/checkPointSample/Python", createSSC)
ssc.start()
# BUG FIX: awaitTerminationOrTimeout() was called without its required
# `timeout` argument, which raises TypeError at runtime; block forever
# instead with awaitTermination().
ssc.awaitTermination()
class BasicOperationTests(PySparkStreamingTestCase):
    """DStream transformation tests: each builds small per-batch inputs,
    applies one operation via ``_test_func`` (provided by the base class,
    defined outside this view) and checks the collected batches."""

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)

        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))

        expected = [
            list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
            for x in input
        ]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)

        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()

        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        # 1-second batches; remember() keeps RDDs alive long enough to slice.
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream(
            [self.sc.parallelize(d, 1) for d in input])
        # Batch times recorded as each RDD is processed.
        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)
        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            # Slice [begin, end] relative to the first batch time and
            # flatten the returned RDDs' elements into one list.
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)

        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)

        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)

        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
                    [(0, 14), (1, 11), (2, 12), (3, 13)],
                    [(1, 11), (2, 11), (3, 11), (4, 11)]]
        self._test_func(input, func, expected, sort=True)

    def test_flatMapValues(self):
        """Basic operation test for DStream.flatMapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 1), (3, 1)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.flatMapValues(lambda x: (x, x + 10))

        expected = [[("a", 2), ("a", 12), ("b", 2), ("b", 12), ("c", 1),
                     ("c", 11), ("d", 1), ("d", 11)],
                    [(0, 4), (0, 14), (1, 1), (1, 11), (2, 1), (2, 11),
                     (3, 1), (3, 11)],
                    [(1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11),
                     (4, 1), (4, 11)]]
        self._test_func(input, func, expected)

    def test_glom(self):
        """Basic operation test for DStream.glom."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        # Two partitions per RDD so glom() yields two sublists per batch.
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.glom()

        expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]]
        self._test_func(rdds, func, expected)

    def test_mapPartitions(self):
        """Basic operation test for DStream.mapPartitions."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            def f(iterator):
                # One partial sum per partition.
                yield sum(iterator)
            return dstream.mapPartitions(f)

        expected = [[3, 7], [11, 15], [19, 23]]
        self._test_func(rdds, func, expected)

    def test_countByValue(self):
        """Basic operation test for DStream.countByValue."""
        input = [
            list(range(1, 5)) * 2,
            list(range(5, 7)) + list(range(5, 9)), ["a", "a", "b", ""]
        ]

        def func(dstream):
            return dstream.countByValue()

        expected = [[(1, 2), (2, 2), (3, 2), (4, 2)],
                    [(5, 2), (6, 2), (7, 1), (8, 1)],
                    [("a", 2), ("b", 1), ("", 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_groupByKey(self):
        """Basic operation test for DStream.groupByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            # mapValues(list) materializes the ResultIterable for comparison.
            return dstream.groupByKey().mapValues(list)

        expected = [[(1, [1]), (2, [1]), (3, [1]), (4, [1])],
                    [(1, [1, 1, 1]), (2, [1, 1]), (3, [1])],
                    [("a", [1, 1]), ("b", [1]), ("", [1, 1, 1])]]
        self._test_func(input, func, expected, sort=True)

    def test_combineByKey(self):
        """Basic operation test for DStream.combineByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            def add(a, b):
                # String concatenation makes the combine order visible.
                return a + str(b)
            return dstream.combineByKey(str, add, add)

        expected = [[(1, "1"), (2, "1"), (3, "1"), (4, "1")],
                    [(1, "111"), (2, "11"), (3, "1")],
                    [("a", "11"), ("b", "1"), ("", "111")]]
        self._test_func(input, func, expected, sort=True)

    def test_repartition(self):
        # Two input partitions collapsed to one; glom() exposes the layout.
        input = [range(1, 5), range(5, 9)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.repartition(1).glom()

        expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]]
        self._test_func(rdds, func, expected)

    def test_union(self):
        # Second stream is shorter; union pads per-batch pairing.
        input1 = [range(3), range(5), range(6)]
        input2 = [range(3, 6), range(5, 6)]

        def func(d1, d2):
            return d1.union(d2)

        expected = [list(range(6)), list(range(6)), list(range(6))]
        self._test_func(input1, func, expected, input2=input2)

    def test_cogroup(self):
        input = [[(1, 1), (2, 1), (3, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]]
        input2 = [[(1, 2)],
                  [(4, 1)],
                  [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]]

        def func(d1, d2):
            # Materialize the grouped iterables as lists for comparison.
            return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs)))

        expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))],
                    [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))],
                    [("a", ([1, 1], [1, 1])), ("b", ([1], [1])),
                     ("", ([1, 1], [1, 2]))]]
        self._test_func(input, func, expected, sort=True, input2=input2)

    def test_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.join(b)

        expected = [[('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_left_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.leftOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_right_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.rightOuterJoin(b)

        expected = [[('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_full_outer_join(self):
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.fullOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_update_state_by_key(self):

        def updater(vs, s):
            # Accumulate every value seen so far for the key.
            if not s:
                s = []
            s.extend(vs)
            return s

        input = [[('k', i)] for i in range(5)]

        def func(dstream):
            return dstream.updateStateByKey(updater)

        expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_update_state_by_key_initial_rdd(self):

        def updater(vs, s):
            # Accumulate every value seen so far for the key.
            if not s:
                s = []
            s.extend(vs)
            return s

        # Seed state [0, 1] so batches start counting from 2.
        initial = [('k', [0, 1])]
        initial = self.sc.parallelize(initial, 1)
        input = [[('k', i)] for i in range(2, 5)]

        def func(dstream):
            return dstream.updateStateByKey(updater, initialRDD=initial)

        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_failed_func(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            raise ValueError("This is a special error")

        input_stream.map(failed_func).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            # NOTE(review): bare except — presumably to also catch Py4J
            # errors surfacing from the JVM; confirm before narrowing.
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func2(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream1 = self.ssc.queueStream(input)
        input_stream2 = self.ssc.queueStream(input)

        def failed_func(rdd1, rdd2):
            raise ValueError("This is a special error")

        input_stream1.transformWith(failed_func, input_stream2, True).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            # NOTE(review): bare except — see test_failed_func.
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func_with_reseting_failure(self):
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            if i == 1:
                # Make it fail in the second batch
                raise ValueError("This is a special error")
            else:
                return i

        # We should be able to see the results of the 3rd and 4th batches even if the second batch
        # fails
        expected = [[0], [2], [3]]
        self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3))
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            # NOTE(review): bare except — see test_failed_func.
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")
import os

from pyspark.sql import SparkSession
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

spark = SparkSession.builder.appName("test").master("local[*]").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 10)

# SECURITY FIX: an AWS access-key pair was previously hard-coded here.
# Never commit long-lived credentials to source — the leaked pair must be
# rotated. Credentials are now read from the standard AWS environment
# variables.
lines = KinesisUtils.createStream(
    ssc,
    "test",
    "test_s",
    "https://kinesis.eu-north-1.amazonaws.com",
    "eu-north-1",
    InitialPositionInStream.LATEST,
    awsAccessKeyId=os.environ["AWS_ACCESS_KEY_ID"],
    awsSecretKey=os.environ["AWS_SECRET_ACCESS_KEY"],
    checkpointInterval=2)
# -*- coding: utf-8 -*- from __future__ import absolute_import, print_function, division, unicode_literals import sys from pyspark import SparkConf, SparkContext from pyspark.storagelevel import StorageLevel from pyspark.streaming.context import StreamingContext if __name__ == '__main__': conf = SparkConf().setAppName('Network Word Count') sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 5) lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]), StorageLevel.MEMORY_AND_DISK_SER) word_counts = ( lines.flatMap(lambda l: l.split(' ')) .map(lambda w: (w, 1L)) .reduceByKey(lambda x, y: x + y) ) word_counts.pprint() ssc.start() ssc.awaitTermination()
def setupFunc():
    """Factory for getActiveOrCreate; records that it actually ran."""
    context = StreamingContext(self.sc, self.duration)
    context.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.setupCalled = True
    return context
class StreamingTestCase(SparkTestingBaseReuse):
    """Basic common test case for Spark Streaming tests. Provides a
    Spark Streaming context as well as some helper methods for creating
    streaming input and collecting streaming output.
    Modeled after PySparkStreamingTestCase."""

    timeout = 15  # seconds to wait for expected streaming output
    duration = .5  # batch interval in seconds

    @classmethod
    def setUpClass(cls):
        super(StreamingTestCase, cls).setUpClass()
        cls._checkpointDir = tempfile.mkdtemp()
        cls.sc.setCheckpointDir(cls._checkpointDir)

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls._checkpointDir)
        super(StreamingTestCase, cls).tearDownClass()

    @classmethod
    def _sort_result_based_on_key(cls, result):
        """Return `result` with each inner batch sorted.

        BUG FIX: this used to return a lazy ``map`` object (under Python 3)
        whose return value callers discarded — since ``sorted`` does not
        mutate, ``sort=True`` in run_func was a silent no-op. It now
        returns a concrete list that callers must assign.
        """
        return [sorted(x) for x in result]

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        # Stop streaming only; the SparkContext is shared across tests.
        self.ssc.stop(False)

    def wait_for(self, result, n):
        """Poll until `result` holds `n` items or `timeout` elapses."""
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        # With block=False the caller starts the context itself and the
        # (still-empty) list fills in asynchronously.
        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def run_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists
            or list of RDDs.
        @param input2: Optional second dataset for the test. If provided
            your func must take two PythonDStreams as input.
        @param func: wrapped function. This function should return
            PythonDStream.
        @param expected: expected output for this testcase.
        @param sort: if True, compare batches order-insensitively by
            sorting each batch on both sides before comparison.

        Warning: If output is longer than expected this will silently
        discard the additional output. TODO: fail when this happens.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]

        # Apply test function to stream.
        if input2:
            input_stream2 = self.ssc.queueStream(input2)
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            # BUG FIX: the sorted copies were previously discarded, so the
            # comparison below ran on unsorted data. Assign them.
            result = self._sort_result_based_on_key(result)
            expected = self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)
def setUp(self): self.ssc = StreamingContext(self.sc, self.duration)
class StreamingTestCase(SparkTestingBaseReuse):
    """Basic common test case for Spark Streaming tests. Provides a
    Spark Streaming context as well as some helper methods for creating
    streaming input and collecting streaming output.
    Modeled after PySparkStreamingTestCase."""

    timeout = 15  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        super(StreamingTestCase, cls).setUpClass()
        # NOTE(review): checkpoints go into the shared "/tmp" and are not
        # cleaned up; the sibling variant of this class uses
        # tempfile.mkdtemp() — consider aligning.
        cls.sc.setCheckpointDir("/tmp")

    @classmethod
    def tearDownClass(cls):
        super(StreamingTestCase, cls).tearDownClass()

    @classmethod
    def _sort_result_based_on_key(cls, result):
        """Sort each collected batch in place and return ``result``.

        Bug fix: this previously returned a *lazy* ``map`` of sorted
        copies, and ``run_func`` discarded the return value — so with
        ``sort=True`` nothing was ever sorted.  Sorting in place makes
        the helper effective for both call styles.
        """
        for batch in result:
            batch.sort()
        return result

    def setUp(self):
        # Fresh StreamingContext per test; SparkContext is shared/reused.
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        # Stop streaming only (False): keep the shared SparkContext alive.
        self.ssc.stop(False)

    def wait_for(self, result, n):
        """Poll until `result` has at least `n` items or `timeout` elapses."""
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            # Only keep non-empty batches, up to the n requested.
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def run_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists
        or list of RDDs.
        @param input2: Optional second dataset for the test. If provided
        your func must take two PythonDStreams as input.
        @param func: wrapped function. This function should return
        PythonDStream.
        @param expected: expected output for this testcase.
        Warning: If output is longer than expected this will silently
        discard the additional output. TODO: fail when this happens.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]

        # Apply test function to stream.
        if input2:
            input_stream2 = self.ssc.queueStream(input2)
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            # In-place sort (see _sort_result_based_on_key).
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)
class StreamingContextTests(PySparkStreamingTestCase):
    # Integration tests for the public StreamingContext API.

    duration = 0.1  # short batch interval keeps the tests fast

    def _add_input_stream(self):
        # Register a queue stream plus a non-blocking collector so the
        # context has an output operation and can be started.
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        # stop(False) must leave the underlying SparkContext usable.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(
            len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        # stop() must be idempotent — second call must not raise.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            # Sleep so each file's mtime is newer than the stream start
            # and it is detected as a new file.
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        # Fixed-length records of 10 bytes, unpacked as 10 signed bytes.
        dstream = self.ssc.binaryRecordsStream(
            d, 10).map(lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))],
                         [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        # Union of two identical streams: each batch appears twice
        # (list * 2 concatenates the list with itself).
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        # Order follows the union order in func, not the input order.
        self.assertEqual([2, 3, 1], self._take(dstream, 3))
class PySparkStreamingTestCase(unittest.TestCase):
    """Shared fixture for PySpark Streaming tests: one SparkContext per
    test class, one StreamingContext per test, plus helpers to collect
    or take elements from a DStream."""

    timeout = 10  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        class_name = cls.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        cls.sc = SparkContext(appName=class_name, conf=conf)
        # Use a private temp directory instead of the shared "/tmp" root so
        # parallel runs cannot interfere (matches the other variant of this
        # class in this file).
        cls.sc.setCheckpointDir(tempfile.mkdtemp())

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()
        # Clean up in the JVM just in case there has been some issues in Python API
        jSparkContextOption = SparkContext._jvm.SparkContext.get()
        if jSparkContextOption.nonEmpty():
            jSparkContextOption.get().stop()

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            # Stop streaming only; keep the class-level SparkContext alive.
            self.ssc.stop(False)
        # Clean up in the JVM just in case there has been some issues in Python API
        jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
        if jStreamingContextOption.nonEmpty():
            jStreamingContextOption.get().stop(False)

    def wait_for(self, result, n):
        """Poll until `result` has at least `n` items or `timeout` elapses."""
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            # Only keep non-empty batches, up to the n requested.
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return
            PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)
        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort each output batch in place by its first tuple element."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
class StreamingContextTests(PySparkStreamingTestCase):
    # Integration tests for StreamingContext, including the
    # getActive()/getActiveOrCreate() lifecycle API.

    duration = 0.1  # short batch interval keeps the tests fast
    setupCalled = False  # flag flipped by setupFunc to detect whether it ran

    def _add_input_stream(self):
        # Register a queue stream plus a non-blocking collector so the
        # context has an output operation and can be started.
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        # stop(False) must leave the underlying SparkContext usable.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        # stop() must be idempotent — second call must not raise.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            # Sleep so each file's mtime is newer than the stream start
            # and it is detected as a new file.
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        # Fixed-length records of 10 bytes, unpacked as 10 signed bytes.
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))],
                         [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        # Union of identical streams: each batch appears twice (list * 2).
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        # Order follows the union order in func, not the input order.
        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        # Stop via the underlying Java context to bypass the Python wrapper.
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            # Builds a fresh context and flips setupCalled so the test
            # can tell whether it was invoked.
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)
# 6.2.4절 예제 6-12 from pyspark import SparkContext, SparkConf, storagelevel from pyspark.streaming.context import StreamingContext from pyspark.streaming.kafka import KafkaUtils ## pyspark에서 실행할 경우 sparkContext는 생성하지 않습니다! # ./pyspark --packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.2 conf = SparkConf() sc = SparkContext(master="local[*]", appName="KafkaSample", conf=conf) ssc = StreamingContext(sc, 3) ds1 = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group1", {"test": 3}) ds2 = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": "localhost:9092"}) ds1.pprint() ds2.pprint() ssc.start() ssc.awaitTermination()
class PySparkStreamingTestCase(unittest.TestCase):
    """Shared fixture for PySpark Streaming tests: one SparkContext per
    test class, one StreamingContext per test, plus helpers to collect
    or take elements from a DStream."""

    timeout = 30  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        class_name = cls.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        cls.sc = SparkContext(appName=class_name, conf=conf)
        cls.sc.setCheckpointDir(tempfile.mkdtemp())

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jSparkContextOption = SparkContext._jvm.SparkContext.get()
            if jSparkContextOption.nonEmpty():
                jSparkContextOption.get().stop()
        except Exception:
            # Best-effort cleanup only.  Was a bare "except:", which also
            # swallowed SystemExit/KeyboardInterrupt.
            pass

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            # Stop streaming only; keep the class-level SparkContext alive.
            self.ssc.stop(False)
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
            if jStreamingContextOption.nonEmpty():
                jStreamingContextOption.get().stop(False)
        except Exception:
            # Best-effort cleanup only (previously a bare "except:").
            pass

    def wait_for(self, result, n):
        """Poll until `result` has at least `n` items or `timeout` elapses."""
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            # Only keep non-empty batches, up to the n requested.
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return
            PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(
            input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)
        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        self.assertEqual(expected, result)

    def _sort_result_based_on_key(self, outputs):
        """Sort each output batch in place by its first tuple element."""
        for output in outputs:
            output.sort(key=lambda x: x[0])
class StreamingContextTests(PySparkStreamingTestCase):
    # Integration tests for StreamingContext, including transform over
    # multiple streams and the getActive()/getActiveOrCreate() lifecycle.

    duration = 0.1  # short batch interval keeps the tests fast
    setupCalled = False  # flag flipped by setupFunc to detect whether it ran

    def _add_input_stream(self):
        # Register a queue stream plus a non-blocking collector so the
        # context has an output operation and can be started.
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        # stop(False) must leave the underlying SparkContext usable.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(
            len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        # stop() must be idempotent — second call must not raise.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            # Sleep so each file's mtime is newer than the stream start
            # and it is detected as a new file.
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        # Fixed-length records of 10 bytes, unpacked as 10 signed bytes.
        dstream = self.ssc.binaryRecordsStream(
            d, 10).map(lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))],
                         [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        # Union of identical streams: each batch appears twice (list * 2).
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        # Order follows the union order in func, not the input order.
        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        dstream = self.ssc.queueStream(
            [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        # Stop via the underlying Java context to bypass the Python wrapper.
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            # Builds a fresh context and flips setupCalled so the test
            # can tell whether it was invoked.
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

    def test_await_termination_or_timeout(self):
        self._add_input_stream()
        self.ssc.start()
        # Running context: the tiny timeout expires first -> False.
        self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001))
        self.ssc.stop(False)
        # Stopped context: returns True immediately.
        self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001))
def test_get_or_create_and_get_active_or_create(self):
    """End-to-end check that getOrCreate() / getActiveOrCreate() recover
    state from checkpoint files, and that setup() runs only when there is
    neither an active context nor checkpoint data.

    Fixes: ``assertEquals`` is a deprecated alias removed in Python 3.12 —
    replaced with ``assertEqual``.  Also added a short sleep to the
    ``n > m`` retry branch, which previously busy-spun with no pause.
    """
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        # Running word count: add new counts onto prior state (None -> 0).
        return sum(vs, s or 0)

    def setup():
        # Builds a fresh context; flips setupCalled so the assertions can
        # tell whether recovery happened or setup actually ran.
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 0.5)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(.5)
        self.setupCalled = True
        return ssc

    cpd = tempfile.mkdtemp("test_streaming_cps")
    self.ssc = StreamingContext.getOrCreate(cpd, setup)
    self.ssc.start()

    def check_output(n):
        # Feed one input file, then poll the newest output dir until the
        # accumulated per-word count reaches n (state survives restarts).
        while not os.listdir(outputd):
            time.sleep(0.01)
        time.sleep(1)  # make sure mtime is larger than the previous one
        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])

        while True:
            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                time.sleep(0.01)
                continue
            ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                time.sleep(0.01)
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                # Count not caught up yet; back off briefly instead of
                # busy-spinning.
                time.sleep(0.01)
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)

    # Verify the getOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(3)

    # Verify the getActiveOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(4)

    # Verify that getActiveOrCreate() returns active context
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(cpd, setup), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
    self.ssc.stop(True, True)
    shutil.rmtree(cpd)  # delete checkpoint directory
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(cpd, setup)
    self.assertTrue(self.setupCalled)
    self.ssc.stop(True, True)
import sys from pyspark.streaming.context import StreamingContext from pyspark.streaming.duration import * if __name__ == "__main__": if len(sys.argv) != 2: print >> sys.stderr, "Usage: wordcount <directory>" exit(-1) ssc = StreamingContext(appName="PythonStreamingWordCount", duration=Seconds(1)) lines = ssc.textFileStream(sys.argv[1]) counts = lines.flatMap(lambda line: line.split(" "))\ .map(lambda x: (x, 1))\ .reduceByKey(lambda a, b: a+b) counts.pyprint() ssc.start() ssc.awaitTermination()
class StreamingContextTests(PySparkStreamingTestCase):
    # Integration tests for the public StreamingContext API.

    duration = 0.1  # short batch interval keeps the tests fast

    def _add_input_stream(self):
        # Register a queue stream plus a non-blocking collector so the
        # context has an output operation and can be started.
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        # stop(False) must leave the underlying SparkContext usable.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        # stop() must be idempotent — second call must not raise.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ("a", "b"):
            # Sleep so each file's mtime is newer than the stream start
            # and it is detected as a new file.
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        # Fixed-length records of 10 bytes, unpacked as 10 signed bytes.
        dstream = self.ssc.binaryRecordsStream(d, 10).map(lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ("a", "b"):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        # Union of identical streams: each batch appears twice (list * 2).
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        # Order follows the union order in func, not the input order.
        self.assertEqual([2, 3, 1], self._take(dstream, 3))
# 6.1.1절 예제 6-3 from pyspark import SparkContext, SparkConf from pyspark.streaming.context import StreamingContext conf = SparkConf() sc = SparkContext(master="local[*]", appName="SteamingSample", conf=conf) ssc = StreamingContext(sc, 3) rdd1 = sc.parallelize(["Spark Streaming Sample ssc"]) rdd2 = sc.parallelize(["Spark Quque Spark API"]) inputQueue = [rdd1, rdd2] lines = ssc.queueStream(inputQueue, True) words = lines.flatMap(lambda v: v.split(" ")) words.countByValue().pprint() ssc.start() ssc.awaitTermination()
def createStream(
    ssc: StreamingContext,
    kinesisAppName: str,
    streamName: str,
    endpointUrl: str,
    regionName: str,
    initialPositionInStream: str,
    checkpointInterval: int,
    metricsLevel: int = MetricsLevel.DETAILED,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2,
    awsAccessKeyId: Optional[str] = None,
    awsSecretKey: Optional[str] = None,
    decoder: Union[
        Callable[[Optional[bytes]], T], Callable[[Optional[bytes]], Optional[str]]
    ] = utf8_decoder,
    stsAssumeRoleArn: Optional[str] = None,
    stsSessionName: Optional[str] = None,
    stsExternalId: Optional[str] = None,
) -> Union["DStream[Union[T, Optional[str]]]", "DStream[T]"]:
    """
    Create an input stream that pulls messages from a Kinesis stream. This uses the
    Kinesis Client Library (KCL) to pull messages from Kinesis.

    Parameters
    ----------
    ssc : :class:`StreamingContext`
        StreamingContext object
    kinesisAppName : str
        Kinesis application name used by the Kinesis Client Library (KCL) to
        update DynamoDB
    streamName : str
        Kinesis stream name
    endpointUrl : str
        Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
    regionName : str
        Name of region used by the Kinesis Client Library (KCL) to update
        DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
    initialPositionInStream : str
        In the absence of Kinesis checkpoint info, this is the
        worker's initial starting position in the stream. The values are either the
        beginning of the stream per Kinesis' limit of 24 hours
        (InitialPositionInStream.TRIM_HORIZON) or the tip of the stream
        (InitialPositionInStream.LATEST).
    checkpointInterval : int
        Checkpoint interval(in seconds) for Kinesis checkpointing. See the Kinesis
        Spark Streaming documentation for more details on the different types of
        checkpoints.
    metricsLevel : int
        Level of CloudWatch PutMetrics. Can be set to either DETAILED, SUMMARY, or NONE.
        (default is DETAILED)
    storageLevel : :class:`pyspark.StorageLevel`, optional
        Storage level to use for storing the received objects (default is
        StorageLevel.MEMORY_AND_DISK_2)
    awsAccessKeyId : str, optional
        AWS AccessKeyId (default is None. If None, will use
        DefaultAWSCredentialsProviderChain)
    awsSecretKey : str, optional
        AWS SecretKey (default is None. If None, will use
        DefaultAWSCredentialsProviderChain)
    decoder : function, optional
        A function used to decode value (default is utf8_decoder)
    stsAssumeRoleArn : str, optional
        ARN of IAM role to assume when using STS sessions to read from the
        Kinesis stream (default is None).
    stsSessionName : str, optional
        Name to uniquely identify STS sessions used to read from Kinesis stream,
        if STS is being used (default is None).
    stsExternalId : str, optional
        External ID that can be used to validate against the assumed IAM role's
        trust policy, if STS is being used (default is None).

    Returns
    -------
    A DStream object

    Notes
    -----
    The given AWS credentials will get saved in DStream checkpoints if checkpointing
    is enabled. Make sure that your checkpoint directory is secure.
    """
    # Convert the Python storage level and checkpoint interval into their
    # JVM (py4j) counterparts before calling into Scala.
    jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
    jduration = ssc._jduration(checkpointInterval)
    jvm = ssc._jvm
    assert jvm is not None

    try:
        helper = jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper()
    except TypeError as e:
        # py4j raises exactly this TypeError when the Kinesis helper class is
        # not on the classpath (the package resolves to a bare JavaPackage);
        # in that case print guidance about the missing assembly jar, then
        # re-raise in all cases.
        if str(e) == "'JavaPackage' object is not callable":
            _print_missing_jar(
                "Streaming's Kinesis",
                "streaming-kinesis-asl",
                "streaming-kinesis-asl-assembly",
                ssc.sparkContext.version,
            )
        raise
    jstream = helper.createStream(
        ssc._jssc,
        kinesisAppName,
        streamName,
        endpointUrl,
        regionName,
        initialPositionInStream,
        jduration,
        metricsLevel,
        jlevel,
        awsAccessKeyId,
        awsSecretKey,
        stsAssumeRoleArn,
        stsSessionName,
        stsExternalId,
    )
    # Records arrive from the JVM as raw bytes (NoOpSerializer); apply the
    # caller-supplied decoder to each value.
    stream: DStream = DStream(jstream, ssc, NoOpSerializer())
    return stream.map(lambda v: decoder(v))
# 6.2.3절 from pyspark import SparkContext, SparkConf from pyspark.streaming.context import StreamingContext conf = SparkConf() sc = SparkContext(master="local[*]", appName="QueueSample", conf=conf) ssc = StreamingContext(sc, 3) rdd1 = sc.parallelize(["a", "b", "c"]) rdd2 = sc.parallelize(["c", "d", "e"]) queue = [rdd1, rdd2] ds = ssc.queueStream(queue) ds.pprint() ssc.start() ssc.awaitTermination()
def test_get_or_create_and_get_active_or_create(self):
    """End-to-end test of StreamingContext.getOrCreate() and getActiveOrCreate().

    Exercises, in order:
      1. getOrCreate() calls setup() when no checkpoint files exist.
      2. getOrCreate() recovers from checkpoint files (setup() NOT called).
      3. getOrCreate() reuses an already-created SparkContext.
      4. getActiveOrCreate() recovers from checkpoint files.
      5. getActiveOrCreate() returns the active context without calling setup().
      6. getActiveOrCreate() reuses an already-created SparkContext.
      7. getActiveOrCreate() calls setup() once the checkpoint dir is deleted.

    The streaming job itself is a word-count over a text-file stream whose
    state is checkpointed, so output files are what check_output() polls for.
    """
    # Temp input dir watched by textFileStream, and output prefix for
    # saveAsTextFiles (trailing "/" so "test" becomes a child path).
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        # updateStateByKey function: running sum of counts per key
        # (s is None on first occurrence of a key, hence `s or 0`).
        return sum(vs, s or 0)

    def setup():
        # Builds a fresh checkpointed streaming word-count job. Sets
        # self.setupCalled so the test can detect whether get*OrCreate()
        # invoked this function or recovered from a checkpoint instead.
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 2)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(2)
        self.setupCalled = True
        return ssc

    # Verify that getOrCreate() calls setup() in absence of checkpoint files
    self.cpd = tempfile.mkdtemp("test_streaming_cps")
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    self.ssc.start()

    def check_output(n):
        # Writes input file `n` (ten lines of "0".."9") and polls the
        # output dir until a completed batch reflects a count of at
        # least n for every key, i.e. state was preserved across runs.
        while not os.listdir(outputd):
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")
        time.sleep(1)  # make sure mtime is larger than the previous one

        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])

        while True:
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")

            # Latest (lexicographically greatest) output batch directory.
            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                continue
            ordd = self.ssc.sparkContext.textFile(p).map(
                lambda line: line.split(","))
            # Counts only ("key,count" -> count as int).
            d = ordd.values().map(int).collect()
            if not d:
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            # All keys must share one identical count value.
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                # Batch predates input file n; wait for a newer one.
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)

    # Verify the getOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)

    self.ssc.start()
    check_output(3)

    # Verify that getOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify the getActiveOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)

    self.ssc.start()
    check_output(4)

    # Verify that getActiveOrCreate() returns active context
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup),
                     self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
    self.ssc.stop(True, True)
    shutil.rmtree(self.cpd)  # delete checkpoint directory
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    # Stop everything
    self.ssc.stop(True, True)
# Setup script for the streaming-operation examples: builds five
# one-RDD queue streams, one per example input type.
from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

checkpointDir = "./checkPoints/StreamingOps/Scala"

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="StreamingOps", conf=conf)

# 1-second batch interval.
ssc = StreamingContext(sc, 1)

# Example inputs: plain strings, a CSV line, two keyed pair sets, a number range.
letters = ssc.sparkContext.parallelize(["a", "b", "c", "c", "c"])
csv_line = ssc.sparkContext.parallelize(["1,2,3,4,5"])
left_pairs = ssc.sparkContext.parallelize([("k1", "r1"), ("k2", "r2"), ("k3", "r3")])
right_pairs = ssc.sparkContext.parallelize([("k1", "s1"), ("k2", "s2")])
numbers = ssc.sparkContext.range(1, 6)

# One single-RDD queue per stream; oneAtATime=True delivers one RDD per batch.
ds1 = ssc.queueStream([letters], True)
ds2 = ssc.queueStream([csv_line], True)
ds3 = ssc.queueStream([left_pairs], True)
ds4 = ssc.queueStream([right_pairs], True)
ds5 = ssc.queueStream([numbers], True)

# [How to run the examples] Uncomment the example you want below and run it!!

# Section 6.3.1 (in Python it is pprint, not print)
# ds1.pprint()