def test_distinct(self):
    # distinct() must yield every input value exactly once.
    values = [1, 2, 5, 1, 6, 8, 2, 90]
    expected = set(values)

    deduplicated = list(MockRDD.from_seq(values).distinct())

    # Equal length rules out surviving duplicates; equal membership rules
    # out lost or invented elements.
    self.assertEqual(len(deduplicated), len(expected))
    self.assertEqual(set(deduplicated), expected)
def test_reduceByKey(self):
    # reduceByKey() must combine the values that share a key using the
    # supplied binary function (addition here).
    pairs = [('a', 1), ('b', 7), ('a', 2)]
    reduced = MockRDD.from_seq(pairs).reduceByKey(lambda a, b: a + b)

    collected = list(reduced)

    # Check the length before converting to a dict: a dict would silently
    # hide a key that was emitted more than once.
    self.assertEqual(len(collected), 2)
    self.assertEqual(dict(collected), {'a': 3, 'b': 7})
def test_combineByKey(self):
    # combineByKey() must collect the values of each key into a set, and it
    # must do so using all three callbacks.
    pairs = [('a', 1), ('b', 3), ('a', 5), ('b', 3), ('a', 1)]

    # We want to be sure that both value merging and combiner merging are
    # performed. Everything could be done with value merging alone, but
    # then bugs in the combiner-merging path would go unnoticed.
    called = {'merge_value': False, 'merge_combiners': False}

    def create_combiner(v):
        return {v}

    def merge_value(v, x):
        called['merge_value'] = True
        v.add(x)
        return v

    def merge_combiners(a, b):
        called['merge_combiners'] = True
        a.update(b)
        return a

    combined = MockRDD.from_seq(pairs).combineByKey(
        create_combiner, merge_value, merge_combiners)
    collected = list(combined)

    # Exactly one output pair per key, holding the distinct values seen.
    self.assertEqual(len(collected), 2)
    self.assertEqual(dict(collected), {'a': {1, 5}, 'b': {3}})

    # Both merge callbacks must have been exercised at least once.
    self.assertTrue(called['merge_value'])
    self.assertTrue(called['merge_combiners'])
def test_flatMap(self):
    # flatMap() must call the function on every element and concatenate
    # the iterables it returns, in input order.
    x = [1, 2, 3]

    def func(el):
        # Use the TestCase assertion instead of a bare ``assert``: bare
        # asserts are removed when Python runs with -O, and assertIn is
        # what test_flatMapValues uses for the same kind of check.
        self.assertIn(el, x)
        yield from range(el)

    rdd = MockRDD.from_seq(x).flatMap(func)

    # 1 -> [0], 2 -> [0, 1], 3 -> [0, 1, 2]
    self.assertEqual(list(rdd), [0, 0, 1, 0, 1, 2])
def test_flatMapValues(self):
    # flatMapValues() must pass only the value to the function and pair
    # every item it yields with the original key.
    pairs = [('a', 1), ('b', 3)]

    def func(el):
        # The callback must see the bare values, never the (key, value) pair.
        self.assertIn(el, (1, 3))
        yield from range(1, el)

    expanded = MockRDD.from_seq(pairs).flatMapValues(func)

    # range(1, 1) is empty, so key 'a' produces no output at all.
    self.assertEqual(list(expanded), [('b', 1), ('b', 2)])
def test_groupByKey(self):
    # groupByKey() must emit one pair per key whose value holds every
    # value seen for that key, duplicates included.
    pairs = [('a', 1), ('b', 3), ('a', 5), ('b', 3)]
    collected = list(MockRDD.from_seq(pairs).groupByKey())

    # Check the length before building a dict so a repeated key is caught.
    self.assertEqual(len(collected), 2)

    by_key = dict(collected)
    self.assertEqual(set(by_key), {'a', 'b'})
    # 'a' is compared as a set (order-insensitive); 'b' is compared as a
    # list so that the duplicate 3 is verified to be preserved.
    self.assertEqual(set(by_key['a']), {1, 5})
    self.assertEqual(by_key['b'], [3, 3])
def test_aggregate(self):
    # aggregate() must fold elements into the zero value with the
    # sequence operator and merge partial results with the combine
    # operator; the combine operator must actually be invoked.
    values = [1, 2, 3, 5, 2, 8, 3]
    combine_was_called = False

    def seq_op(c, x):
        c.add(x)
        return c

    def comb_op(a, b):
        nonlocal combine_was_called
        combine_was_called = True
        a.update(b)
        return a

    aggregated = MockRDD.from_seq(values).aggregate(set(), seq_op, comb_op)

    self.assertEqual(aggregated, set(values))
    self.assertTrue(combine_was_called)
def test_count(self):
    # count() must report the number of elements, duplicates included.
    rdd = MockRDD.from_seq([1, 2, 1])
    self.assertEqual(rdd.count(), 3)
def test_sum(self):
    # sum() must add up all elements: 1 + 5 + 2 == 8.
    rdd = MockRDD.from_seq([1, 5, 2])
    self.assertEqual(rdd.sum(), 8)
def test_min(self):
    # min() must return the smallest element of the sequence.
    rdd = MockRDD.from_seq([1, 2, 3, 5, 2, 8, 3])
    self.assertEqual(rdd.min(), 1)
def test_fold(self):
    # fold() with addition is expected to return the zero value plus the
    # sum of the elements, i.e. the zero value is counted exactly once.
    values = [1, 2, 3, 5]
    zero = 7

    folded = MockRDD.from_seq(values).fold(zero, lambda a, b: a + b)

    self.assertEqual(folded, zero + sum(values))
from mockrdd import MockRDD
from invalid_key_value_pairs import job

# Runs `job` against a MockRDD built from one comma-separated log line and
# prints whatever the job returns.
# NOTE(review): the module name suggests the job deliberately produces
# malformed key/value pairs for MockRDD to flag - confirm.
logs = ['server0,1539015865,127.0.0.1,/index.html']

log_rdd = MockRDD.from_seq(logs)
results = job(log_rdd)
print(results)
def test_reduce(self):
    # reduce() with addition must equal the built-in sum of the elements.
    values = [1, 2, 3, 5]

    reduced = MockRDD.from_seq(values).reduce(lambda a, b: a + b)

    self.assertEqual(reduced, sum(values))
from mockrdd import MockRDD
from invalid_callable import count_distinct_servers

# Runs `count_distinct_servers` against a MockRDD built from two log lines
# that share the same leading 'server0' field, then prints the result.
# NOTE(review): the module name suggests the function deliberately passes
# an invalid callable for MockRDD to flag - confirm.
logs = [
    'server0,1539015865,127.0.0.1,/index.html',
    'server0,1539015866,127.0.0.1,/index.html',
]

log_rdd = MockRDD.from_seq(logs)
results = count_distinct_servers(log_rdd)
print(results)
def test_mapValues(self):
    # mapValues() must transform each value and leave its key untouched.
    pairs = [('a', 1), ('b', 3)]

    incremented = MockRDD.from_seq(pairs).mapValues(lambda a: a + 1)

    self.assertEqual(list(incremented), [('a', 2), ('b', 4)])
def test_countByValue(self):
    # countByValue() must map each distinct element to its number of
    # occurrences: 1 appears twice, 3 once.
    rdd = MockRDD.from_seq([1, 3, 1])
    self.assertEqual(rdd.countByValue(), {1: 2, 3: 1})
def test_keyBy(self):
    # keyBy() must pair each element with the key computed from it,
    # producing (key, element) tuples in input order.
    values = [1, 2]

    keyed = MockRDD.from_seq(values).keyBy(lambda x: x % 2)

    # 1 % 2 == 1 and 2 % 2 == 0
    self.assertEqual(list(keyed), [(1, 1), (0, 2)])
def test_values(self):
    # values() must yield the second item of every pair, in order.
    pairs = [(1, 2), (3, 4)]

    only_values = MockRDD.from_seq(pairs).values()

    self.assertEqual(list(only_values), [2, 4])
def test_keys(self):
    # keys() must yield the first item of every pair, in order.
    pairs = [(1, 2), (3, 4)]

    only_keys = MockRDD.from_seq(pairs).keys()

    self.assertEqual(list(only_keys), [1, 3])
def test_union(self):
    # union() must yield the elements of the first RDD followed by the
    # elements of the second.
    first = [1, 2]
    second = [7, 8]

    left_rdd = MockRDD.from_seq(first)
    right_rdd = MockRDD.from_seq(second)
    merged = left_rdd.union(right_rdd)

    self.assertEqual(list(merged), first + second)
def test_countByKey(self):
    # countByKey() must map each key to the number of pairs carrying it,
    # regardless of the values: key 1 appears twice, key 3 once.
    rdd = MockRDD.from_seq([(1, 'a'), (3, 'a'), (1, 'b')])
    self.assertEqual(rdd.countByKey(), {1: 2, 3: 1})
from mockrdd import MockRDD
from invalid_flatmap import count_distinct_timestamps

# Runs `count_distinct_timestamps` against a MockRDD built from two log
# lines whose second comma-separated field differs, then prints the result.
# NOTE(review): the module name suggests the function deliberately misuses
# flatMap for MockRDD to flag - confirm.
logs = [
    'server0,1539015865,127.0.0.1,/index.html',
    'server0,1539015866,127.0.0.1,/index.html',
]

log_rdd = MockRDD.from_seq(logs)
results = count_distinct_timestamps(log_rdd)
print(results)