Beispiel #1
0
 def test_group_by_key(self):
     l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
     rdd = RDD(l, self.SPARK_CONTEXT)
     rdd = rdd.groupByKey()
     r = rdd.collect()
     r = [(kv[0], list(kv[1])) for kv in r]
     self.assertEquals(sorted(r), sorted([(1, [1]), (2, [1, 2]), (3, [1, 2, 3])]))
Beispiel #2
0
 def test_filter(self):
     for start, stop, step in self.TEST_RANGES:
         l1 = range(start, stop, step)
         l2 = filter(RDDTests.is_square, l1)
         rdd = RDD(list(l1), self.SPARK_CONTEXT)
         rdd = rdd.filter(RDDTests.is_square)
         self.assertEquals(rdd.collect(), list(l2))
Beispiel #3
0
 def test_filter(self):
     for start, stop, step in self.TEST_RANGES:
         l1 = range(start, stop, step)
         l2 = filter(RDDTests.is_square, l1)
         rdd = RDD(list(l1), self.SPARK_CONTEXT)
         rdd = rdd.filter(RDDTests.is_square)
         self.assertEquals(rdd.collect(), list(l2))
Beispiel #4
0
 def test_group_by_key(self):
     l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
     rdd = RDD(l, self.SPARK_CONTEXT)
     rdd = rdd.groupByKey()
     r = rdd.collect()
     r = [(kv[0], list(kv[1])) for kv in r]
     self.assertEquals(sorted(r), sorted([(1, [1]), (2, [1, 2]), (3, [1, 2, 3])]))
Beispiel #5
0
    def test_reduce_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        print(rdd)

        self.assertEquals(sorted(rdd.collect()), sorted([(1, 1), (2, 3), (3, 6)]))
Beispiel #6
0
 def test_sample_with_replacement(self):
     for start, stop, step in self.TEST_RANGES:
         l = range(start, stop, step)
         rdd = RDD(list(l), self.SPARK_CONTEXT)
         sample = rdd.sample(True, self.SAMPLE_FRACTION).collect()
         self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
         for item in sample:
             self.assertTrue(item in l)
Beispiel #7
0
 def test_sample_with_replacement(self):
     for start, stop, step in self.TEST_RANGES:
         l = range(start, stop, step)
         rdd = RDD(list(l), self.SPARK_CONTEXT)
         sample = rdd.sample(True, self.SAMPLE_FRACTION).collect()
         self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
         for item in sample:
             self.assertTrue(item in l)
Beispiel #8
0
 def test_intersection(self):
     for start1, stop1, step1 in self.TEST_RANGES:
         for start2, stop2, step2 in self.TEST_RANGES:
             l1 = range(start1, stop1, step1)
             l2 = range(start2, stop2, step2)
             rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
             rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
             rdd = rdd1.intersection(rdd2)
             self.assertEquals(sorted(rdd.collect()), sorted([x for x in l1 if x in l2]))
Beispiel #9
0
 def test_intersection(self):
     for start1, stop1, step1 in self.TEST_RANGES:
         for start2, stop2, step2 in self.TEST_RANGES:
             l1 = range(start1, stop1, step1)
             l2 = range(start2, stop2, step2)
             rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
             rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
             rdd = rdd1.intersection(rdd2)
             self.assertEquals(sorted(rdd.collect()), sorted([x for x in l1 if x in l2]))
Beispiel #10
0
 def test_union(self):
     for start1, stop1, step1 in self.TEST_RANGES:
         for start2, stop2, step2 in self.TEST_RANGES:
             l1 = range(start1, stop1, step1)
             l2 = range(start2, stop2, step2)
             rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
             rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
             rdd = rdd1.union(rdd2)
             self.assertEquals(sorted(rdd.collect()), sorted(list(l1) + list(l2)))
Beispiel #11
0
 def test_union(self):
     for start1, stop1, step1 in self.TEST_RANGES:
         for start2, stop2, step2 in self.TEST_RANGES:
             l1 = range(start1, stop1, step1)
             l2 = range(start2, stop2, step2)
             rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
             rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
             rdd = rdd1.union(rdd2)
             self.assertEquals(sorted(rdd.collect()), sorted(list(l1) + list(l2)))
Beispiel #12
0
 def test_distinct(self):
     for start, stop, step in self.TEST_RANGES:
         l = range(start, stop, step)
         rdd = RDD(list(l), self.SPARK_CONTEXT)
         rdd = rdd.map(RDDTests.return_one)
         rdd = rdd.distinct()
         if len(l) > 0:
             self.assertEquals(rdd.collect(), [1])
         else:
             self.assertEquals(rdd.collect(), [])
Beispiel #13
0
 def test_flat_map(self):
     for start, stop, step in self.TEST_RANGES:
         l1 = range(start, stop, step)
         l2 = map(RDDTests.triplicate, l1)
         l3 = []
         for sl in l2:
             l3.extend(sl)
         rdd = RDD(list(l1), self.SPARK_CONTEXT)
         rdd = rdd.flatMap(RDDTests.triplicate)
         self.assertEquals(rdd.collect(), list(l3))
Beispiel #14
0
 def test_flat_map(self):
     for start, stop, step in self.TEST_RANGES:
         l1 = range(start, stop, step)
         l2 = map(RDDTests.triplicate, l1)
         l3 = []
         for sl in l2:
             l3.extend(sl)
         rdd = RDD(list(l1), self.SPARK_CONTEXT)
         rdd = rdd.flatMap(RDDTests.triplicate)
         self.assertEquals(rdd.collect(), list(l3))
Beispiel #15
0
 def test_cogroup(self):
     l1 = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
     l2 = [(2, 10), (2, 20), (3, 10), (3, 20), (3, 30), (4, 40)]
     rdd1 = RDD(l1, self.SPARK_CONTEXT)
     rdd2 = RDD(l2, self.SPARK_CONTEXT)
     rdd = rdd1.cogroup(rdd2)
     l = rdd.collect()
     self.assertEquals(
         sorted(l),
         sorted([(1, [1], []), (2, [1, 2], [10, 20]), (3, [1, 2, 3], [10, 20, 30]), (4, [], [40])])
     )
Beispiel #16
0
 def test_cogroup(self):
     l1 = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
     l2 = [(2, 10), (2, 20), (3, 10), (3, 20), (3, 30), (4, 40)]
     rdd1 = RDD(l1, self.SPARK_CONTEXT)
     rdd2 = RDD(l2, self.SPARK_CONTEXT)
     rdd = rdd1.cogroup(rdd2)
     l = rdd.collect()
     self.assertEquals(
         sorted(l),
         sorted([(1, [1], []), (2, [1, 2], [10, 20]),
                 (3, [1, 2, 3], [10, 20, 30]), (4, [], [40])]))
Beispiel #17
0
 def test_sample_without_replacement_with_seed(self):
     for start, stop, step in self.TEST_RANGES:
         l = range(start, stop, step)
         rdd = RDD(list(l), self.SPARK_CONTEXT)
         sample1 = rdd.sample(False, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
         sample2 = rdd.sample(False, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
         self.assertEquals(sorted(sample1), sorted(sample2))
         sample = sample1
         self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
         self.assertEquals(sorted(l), sorted(set(l)))
         for item in sample:
             self.assertTrue(item in l)
Beispiel #18
0
 def test_sample_without_replacement_with_seed(self):
     for start, stop, step in self.TEST_RANGES:
         l = range(start, stop, step)
         rdd = RDD(list(l), self.SPARK_CONTEXT)
         sample1 = rdd.sample(False, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
         sample2 = rdd.sample(False, self.SAMPLE_FRACTION, self.SAMPLE_SEED).collect()
         self.assertEquals(sorted(sample1), sorted(sample2))
         sample = sample1
         self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
         self.assertEquals(sorted(l), sorted(set(l)))
         for item in sample:
             self.assertTrue(item in l)
Beispiel #19
0
 def test_cartesian(self):
     for start1, stop1, step1 in self.TEST_RANGES:
         for start2, stop2, step2 in self.TEST_RANGES:
             l1 = range(start1, stop1, step1)
             l2 = range(start2, stop2, step2)
             rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
             rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
             rdd = rdd1.cartesian(rdd2)
             r = rdd.collect()
             self.assertEquals(len(r), len(l1) * len(l2))
             for t, u in r:
                 self.assertTrue(t in l1)
                 self.assertTrue(u in l2)
Beispiel #20
0
 def test_cartesian(self):
     for start1, stop1, step1 in self.TEST_RANGES:
         for start2, stop2, step2 in self.TEST_RANGES:
             l1 = range(start1, stop1, step1)
             l2 = range(start2, stop2, step2)
             rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
             rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
             rdd = rdd1.cartesian(rdd2)
             r = rdd.collect()
             self.assertEquals(len(r), len(l1) * len(l2))
             for t, u in r:
                 self.assertTrue(t in l1)
                 self.assertTrue(u in l2)
Beispiel #21
0
    def test_init(self):
        for start, stop, step in self.TEST_RANGES:
            l = list(range(start, stop, step))
            rdd = RDD(l, self.SPARK_CONTEXT)
            self.assertEquals(l, rdd.collect())

            s = set(range(100))
            rdd = RDD(s, self.SPARK_CONTEXT)
            self.assertEquals(sorted(list(s)), sorted(rdd.collect()))

        t = (1, 2, 3)
        with self.assertRaises(AttributeError):
            RDD(t, self.SPARK_CONTEXT)

        with self.assertRaises(AttributeError):
            RDD('', self.SPARK_CONTEXT)
Beispiel #22
0
    def test_init(self):
        for start, stop, step in self.TEST_RANGES:
            l = list(range(start, stop, step))
            rdd = RDD(l, self.SPARK_CONTEXT)
            self.assertEquals(l, rdd.collect())

            s = set(range(100))
            rdd = RDD(s, self.SPARK_CONTEXT)
            self.assertEquals(sorted(list(s)), sorted(rdd.collect()))

        t = (1, 2, 3)
        with self.assertRaises(AttributeError):
            RDD(t, self.SPARK_CONTEXT)

        with self.assertRaises(AttributeError):
            RDD('', self.SPARK_CONTEXT)
Beispiel #23
0
 def test_distinct(self):
     for start, stop, step in self.TEST_RANGES:
         l = range(start, stop, step)
         rdd = RDD(list(l), self.SPARK_CONTEXT)
         rdd = rdd.map(RDDTests.return_one)
         rdd = rdd.distinct()
         if len(l) > 0:
             self.assertEquals(rdd.collect(), [1])
         else:
             self.assertEquals(rdd.collect(), [])
Beispiel #24
0
 def test_reduce_by_key(self):
     l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
     rdd = RDD(l, self.SPARK_CONTEXT)
     rdd = rdd.reduceByKey(lambda a, b: a + b)
     self.assertEquals(sorted(rdd.collect()), sorted([(1, 1), (2, 3), (3, 6)]))
Beispiel #25
0
 def test_ctx(self):
     rdd = RDD([], self.SPARK_CONTEXT)
     self.assertEquals(rdd.ctx, self.SPARK_CONTEXT)