def test_distinct(self):
    """Check ``distinct`` on an RDD whose elements were all mapped the same way.

    For every ``(start, stop, step)`` in ``self.TEST_RANGES`` the range is
    mapped through ``RDDTests.return_one`` and then de-duplicated. The
    collected result is expected to be ``[1]`` for a non-empty range and
    ``[]`` for an empty one.
    """
    for start, stop, step in self.TEST_RANGES:
        values = list(range(start, stop, step))
        rdd = RDD(values, self.SPARK_CONTEXT)
        rdd = rdd.map(RDDTests.return_one).distinct()
        # An empty input must stay empty; anything else collapses to one item.
        expected = [1] if values else []
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(rdd.collect(), expected)
def test_filter(self):
    """Check that ``RDD.filter`` keeps the same elements as a plain filter.

    For every ``(start, stop, step)`` in ``self.TEST_RANGES`` the range is
    filtered with ``RDDTests.is_square`` both directly and through the RDD,
    and the two results are compared in order.
    """
    for start, stop, step in self.TEST_RANGES:
        values = list(range(start, stop, step))
        # Build the expectation eagerly so it is a concrete list.
        expected = [value for value in values if RDDTests.is_square(value)]
        rdd = RDD(values, self.SPARK_CONTEXT).filter(RDDTests.is_square)
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(rdd.collect(), expected)
def test_init(self):
    """Check which containers the ``RDD`` constructor accepts.

    * A list built from each range in ``self.TEST_RANGES`` must be
      collected back unchanged.
    * A set must be collected back with the same elements (order is not
      compared).
    * A tuple and a string are expected to raise ``AttributeError``.
    """
    for start, stop, step in self.TEST_RANGES:
        values = list(range(start, stop, step))
        rdd = RDD(values, self.SPARK_CONTEXT)
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(values, rdd.collect())

    # The set case does not depend on the loop variables, so it runs once.
    unique_values = set(range(100))
    rdd = RDD(unique_values, self.SPARK_CONTEXT)
    self.assertEqual(sorted(unique_values), sorted(rdd.collect()))

    with self.assertRaises(AttributeError):
        RDD((1, 2, 3), self.SPARK_CONTEXT)

    with self.assertRaises(AttributeError):
        RDD('', self.SPARK_CONTEXT)
def test_group_by_key(self):
    """Check that ``groupByKey`` gathers the values of each key.

    The grouped values are materialised with ``list`` before comparison,
    and both sides are sorted so the order of the keys does not matter.
    """
    pairs = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
    grouped = RDD(pairs, self.SPARK_CONTEXT).groupByKey().collect()
    actual = [(key, list(values)) for key, values in grouped]
    expected = [(1, [1]), (2, [1, 2]), (3, [1, 2, 3])]
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(sorted(actual), sorted(expected))
def test_reduce_by_key(self):
    """Check that ``reduceByKey`` folds the values of each key with addition.

    Both sides are sorted before comparison so the order of the keys does
    not matter.
    """
    # NOTE(review): another method named ``test_reduce_by_key`` appears
    # later in this file; if both live in the same class, the later
    # definition replaces this one -- confirm and drop the duplicate.
    pairs = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
    rdd = RDD(pairs, self.SPARK_CONTEXT)
    rdd = rdd.reduceByKey(lambda a, b: a + b)
    expected = [(1, 1), (2, 3), (3, 6)]
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(sorted(rdd.collect()), sorted(expected))
def test_flat_map(self):
    """Check that ``flatMap`` flattens the per-element results in order.

    For every ``(start, stop, step)`` in ``self.TEST_RANGES`` the expected
    list is built by applying ``RDDTests.triplicate`` to each element and
    concatenating the results; the RDD must collect to the same list.
    """
    for start, stop, step in self.TEST_RANGES:
        values = list(range(start, stop, step))
        expected = [
            item
            for value in values
            for item in RDDTests.triplicate(value)
        ]
        rdd = RDD(values, self.SPARK_CONTEXT).flatMap(RDDTests.triplicate)
        # assertEqual: the assertEquals alias was removed in Python 3.12.
        self.assertEqual(rdd.collect(), expected)
def test_reduce_by_key(self):
    """Check that ``reduceByKey`` sums the values that share a key.

    The collected pairs and the expectation are both sorted, so the test
    does not depend on the order in which keys are returned.
    """
    pairs = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
    reduced = RDD(pairs, self.SPARK_CONTEXT).reduceByKey(lambda a, b: a + b)
    expected = [(1, 1), (2, 3), (3, 6)]
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(sorted(reduced.collect()), sorted(expected))