class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))],
                         [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        dstream = self.ssc.queueStream(
            [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        self.ssc = None
        self.assertEqual(StreamingContext.getActive(), None)

        def setupFunc():
            ssc = StreamingContext(self.sc, self.duration)
            ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
            self.setupCalled = True
            return ssc

        # Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that getActiveOrCreate() returns active context and does not call the setupFunc
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() calls setupFunc after active context is stopped
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

    def test_await_termination_or_timeout(self):
        self._add_input_stream()
        self.ssc.start()
        self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001))
        self.ssc.stop(False)
        self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001))
class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)
        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))
        expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
                    for x in input]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)
        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()
        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])

        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)
        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)
        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)
        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
                    [(0, 14), (1, 11), (2, 12), (3, 13)],
                    [(1, 11), (2, 11), (3, 11), (4, 11)]]
        self._test_func(input, func, expected, sort=True)

    def test_flatMapValues(self):
        """Basic operation test for DStream.flatMapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 1), (3, 1)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.flatMapValues(lambda x: (x, x + 10))
        expected = [[("a", 2), ("a", 12), ("b", 2), ("b", 12),
                     ("c", 1), ("c", 11), ("d", 1), ("d", 11)],
                    [(0, 4), (0, 14), (1, 1), (1, 11),
                     (2, 1), (2, 11), (3, 1), (3, 11)],
                    [(1, 1), (1, 11), (2, 1), (2, 11),
                     (3, 1), (3, 11), (4, 1), (4, 11)]]
        self._test_func(input, func, expected)

    def test_glom(self):
        """Basic operation test for DStream.glom."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.glom()
        expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]]
        self._test_func(rdds, func, expected)

    def test_mapPartitions(self):
        """Basic operation test for DStream.mapPartitions."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            def f(iterator):
                yield sum(iterator)
            return dstream.mapPartitions(f)
        expected = [[3, 7], [11, 15], [19, 23]]
        self._test_func(rdds, func, expected)

    def test_countByValue(self):
        """Basic operation test for DStream.countByValue."""
        input = [list(range(1, 5)) * 2,
                 list(range(5, 7)) + list(range(5, 9)),
                 ["a", "a", "b", ""]]

        def func(dstream):
            return dstream.countByValue()
        expected = [[(1, 2), (2, 2), (3, 2), (4, 2)],
                    [(5, 2), (6, 2), (7, 1), (8, 1)],
                    [("a", 2), ("b", 1), ("", 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_groupByKey(self):
        """Basic operation test for DStream.groupByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            return dstream.groupByKey().mapValues(list)

        expected = [[(1, [1]), (2, [1]), (3, [1]), (4, [1])],
                    [(1, [1, 1, 1]), (2, [1, 1]), (3, [1])],
                    [("a", [1, 1]), ("b", [1]), ("", [1, 1, 1])]]
        self._test_func(input, func, expected, sort=True)

    def test_combineByKey(self):
        """Basic operation test for DStream.combineByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            def add(a, b):
                return a + str(b)
            return dstream.combineByKey(str, add, add)
        expected = [[(1, "1"), (2, "1"), (3, "1"), (4, "1")],
                    [(1, "111"), (2, "11"), (3, "1")],
                    [("a", "11"), ("b", "1"), ("", "111")]]
        self._test_func(input, func, expected, sort=True)

    def test_repartition(self):
        """Basic operation test for DStream.repartition."""
        input = [range(1, 5), range(5, 9)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.repartition(1).glom()
        expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]]
        self._test_func(rdds, func, expected)

    def test_union(self):
        """Basic operation test for DStream.union."""
        input1 = [range(3), range(5), range(6)]
        input2 = [range(3, 6), range(5, 6)]

        def func(d1, d2):
            return d1.union(d2)

        expected = [list(range(6)), list(range(6)), list(range(6))]
        self._test_func(input1, func, expected, input2=input2)

    def test_cogroup(self):
        """Basic operation test for DStream.cogroup."""
        input = [[(1, 1), (2, 1), (3, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]]
        input2 = [[(1, 2)],
                  [(4, 1)],
                  [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]]

        def func(d1, d2):
            return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs)))

        expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))],
                    [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))],
                    [("a", ([1, 1], [1, 1])), ("b", ([1], [1])), ("", ([1, 1], [1, 2]))]]
        self._test_func(input, func, expected, sort=True, input2=input2)

    def test_join(self):
        """Basic operation test for DStream.join."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.join(b)

        expected = [[('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_left_outer_join(self):
        """Basic operation test for DStream.leftOuterJoin."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.leftOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_right_outer_join(self):
        """Basic operation test for DStream.rightOuterJoin."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.rightOuterJoin(b)

        expected = [[('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_full_outer_join(self):
        """Basic operation test for DStream.fullOuterJoin."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.fullOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_update_state_by_key(self):
        """Basic operation test for DStream.updateStateByKey."""

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        input = [[('k', i)] for i in range(5)]

        def func(dstream):
            return dstream.updateStateByKey(updater)

        expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_update_state_by_key_initial_rdd(self):
        """Basic operation test for DStream.updateStateByKey with an initial RDD."""

        def updater(vs, s):
            if not s:
                s = []
            s.extend(vs)
            return s

        initial = [('k', [0, 1])]
        initial = self.sc.parallelize(initial, 1)

        input = [[('k', i)] for i in range(2, 5)]

        def func(dstream):
            return dstream.updateStateByKey(updater, initialRDD=initial)

        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_failed_func(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            raise ValueError("This is a special error")

        input_stream.map(failed_func).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func2(self):
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream1 = self.ssc.queueStream(input)
        input_stream2 = self.ssc.queueStream(input)

        def failed_func(rdd1, rdd2):
            raise ValueError("This is a special error")

        input_stream1.transformWith(failed_func, input_stream2, True).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func_with_reseting_failure(self):
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            if i == 1:
                # Make it fail in the second batch
                raise ValueError("This is a special error")
            else:
                return i

        # We should be able to see the results of the 3rd and 4th batches even if the second batch
        # fails
        expected = [[0], [2], [3]]
        self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3))
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")