Code example #1
    def create_context(n_processes=0):
        if not n_processes:
            return fast_pyspark_tester.Context()

        pool = futures.ProcessPoolExecutor(n_processes)
        return fast_pyspark_tester.Context(
            pool=pool,
            # cloudpickle can serialize the lambdas shipped to the pool;
            # plain pickle is enough to deserialize the results.
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
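
A minimal usage sketch for the helper above (not part of the original snippet), assuming the imports it relies on and that fast_pyspark_tester.Context supports parallelize/map/collect as shown in the other examples:

    # Sketch only: run a tiny job through a Context backed by four processes.
    from concurrent import futures
    import pickle

    import cloudpickle
    import fast_pyspark_tester

    sc = create_context(n_processes=4)
    print(sc.parallelize(range(10)).map(lambda x: x * x).collect())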
Code example #2
 def runtime(self, n=10, processes=1):
     start = time.time()
     with futures.ProcessPoolExecutor(processes) as pool:
         sc = fast_pyspark_tester.Context(
             pool=pool,
             serializer=cloudpickle.dumps,
             deserializer=pickle.loads,
         )
         rdd = sc.parallelize(range(n), 10)
         rdd.map(lambda _: time.sleep(0.1)).collect()
     return time.time() - start
Code example #3
 def test_union(self):
     sc = fast_pyspark_tester.Context()
     rdd1 = sc.parallelize(['Hello'])
     rdd2 = sc.parallelize(['World'])
     union = sc.union([rdd1, rdd2]).collect()
     print(union)
     self.assertEqual(union, ['Hello', 'World'])
Code example #4
 def test_lock1(self):
     """Should not be able to create a new RDD inside a map operation."""
     sc = fast_pyspark_tester.Context()
     self.assertRaises(
         fast_pyspark_tester.exceptions.ContextIsLockedException,
         lambda: sc.parallelize(range(5)).map(
             lambda _: sc.parallelize([1])).collect(),
     )
Code example #5
def main():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()

    sc = fast_pyspark_tester.Context()
    sc.parallelize(range(1000000)).saveAsTextFile(tempFile.name + '.gz')
    rdd = sc.textFile(tempFile.name + '.gz')
    rdd.collect()
Code example #6
    def test_connect(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.textFileStream(LICENSE_FILE, process_all=True)
            .count()
            .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(sum(result), 44)
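
LICENSE_FILE is not defined in these snippets; presumably it is a module-level path constant pointing at the project's license file, with the expected counts (44 lines here, 55 forty-byte records in the chunked-read example below) depending on that file's size. A hypothetical definition:

    # Assumption: a path constant defined elsewhere in the test module.
    LICENSE_FILE = 'LICENSE'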
Code example #7
    def test_lock2(self):
        """Should not be able to create RDDs containing RDDs."""
        sc = fast_pyspark_tester.Context()

        def parallelize_in_parallelize():
            o = sc.parallelize(sc.parallelize(range(x)) for x in range(5))
            print(o.map(lambda x: x.collect()).collect())

        self.assertRaises(
            fast_pyspark_tester.exceptions.ContextIsLockedException,
            parallelize_in_parallelize,
        )
Code example #8
    def test_read_chunks(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.fileBinaryStream(LICENSE_FILE, recordLength=40, process_all=True)
            .count()
            .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(sum(result), 55)
Code example #9
def test_cache_empty_partition():
    m = Manip()

    c = fast_pyspark_tester.Context()
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug)
    rdd = rdd.filter(lambda e: e > 6).cache()
    print(rdd.collect())
    print(rdd.collect())

    print('count of map executions: {}'.format(m.count))
    assert m.count == 10
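
Manip is referenced here and in the timed-cache example further down but is not shown. A plausible minimal version, inferred purely from how it is used (it counts how often the mapped function runs and returns each element unchanged), might look like this:

class Manip(object):
    """Hypothetical helper: counts invocations of the mapped function."""

    def __init__(self):
        self.count = 0

    def trivial_manip_with_debug(self, e):
        self.count += 1
        print('manipulating {}'.format(e))
        return e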
Code example #10
    def test_main(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        counter = Counter()
        (
            ssc.socketBinaryStream('127.0.0.1', 8125, length='<I')
            .foreachRDD(lambda rdd: counter.update(rdd.collect()))
        )
        self.client()

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(counter[b'hellohello'], 1)
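
The self.client() helper is not part of the snippet. Judging from the assertion, it presumably connects to the stream's port and sends a single length-prefixed b'hellohello' payload; a hypothetical sketch:

    def client(self):
        # Hypothetical: send one '<I'-length-prefixed payload to the
        # socketBinaryStream opened above.
        import socket
        import struct

        payload = b'hellohello'
        with socket.create_connection(('127.0.0.1', 8125)) as conn:
            conn.sendall(struct.pack('<I', len(payload)) + payload)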
Code example #11
    def test_connect(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        counter = Counter()
        (
            ssc.socketTextStream('127.0.0.1', 8123)
            .foreachRDD(lambda rdd: counter.update(''.join(rdd.collect()))
                        if rdd.collect() else None)
        )
        self.client()

        ssc.start()
        ssc.awaitTermination(timeout=0.3)
        self.assertEqual(counter['a'], 20)
Code example #12
    def test_mapValues(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
            .mapValues(sorted)
            .foreachRDD(lambda rdd: result.append(rdd.collect()))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.15)
        self.assertEqual(result, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])
Code example #13
    def test_retry(self):
        class EverySecondCallFails(object):
            def __init__(self):
                self.attempt = 0

            def __call__(self, value):
                self.attempt += 1
                if self.attempt % 2 == 1:
                    raise Exception
                return value

        data = list(range(6))
        rdd = fast_pyspark_tester.Context().parallelize(data, 3)
        result = rdd.mapPartitions(EverySecondCallFails()).collect()
        self.assertEqual(result, data)
Code example #14
    def test_groupByKey(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.queueStream([[('a', 5), ('b', 8), ('a', 2)], [('a', 2), ('b', 3)]])
            .groupByKey()
            .mapPartitions(sorted)
            .mapValues(sorted)
            .foreachRDD(lambda rdd: result.append(rdd.collect()))
        )

        ssc.start()
        ssc.awaitTermination(timeout=0.25)
        self.assertEqual(result, [[('a', [2, 5]), ('b', [8])], [('a', [2]), ('b', [3])]])
Code example #15
    def run(self, n=2000, to_kv=None, format_='hello'):
        c = fast_pyspark_tester.Context()
        stream_c = fast_pyspark_tester.streaming.StreamingContext(c, 1.0)

        counts = []
        sensor_sums = defaultdict(float)
        sensor_squares = defaultdict(float)
        sensor_counts = defaultdict(int)
        if format_ not in ('bello', 'struct'):
            t = stream_c.socketTextStream('localhost', self.port)
        else:
            length = {'bello': 5, 'struct': 8}[format_]
            t = stream_c.socketBinaryStream('localhost', self.port, length)
        t.count().foreachRDD(lambda _, rdd: counts.append(rdd.collect()[0]))
        if to_kv is not None:

            def update(rdd):
                for k, v in rdd.collect():
                    sensor_sums[k] += sum(v)
                    sensor_squares[k] += sum(vv**2 for vv in v)
                    sensor_counts[k] += len(v)

            t.map(to_kv).groupByKey().foreachRDD(lambda _, rdd: update(rdd))

        self.client(n, format_=format_)

        stream_c.start()
        stream_c.awaitTermination(timeout=5.0)

        result = max(counts) if counts else 0
        sensor_expectations = {
            # E[X] and E[X^2] per sensor
            k: (sensor_sums[k] / v, sensor_squares[k] / v)
            for k, v in sensor_counts.items()
        }
        sensors = {
            # mean and standard deviation per sensor
            k: (ex_ex2[0], math.sqrt(ex_ex2[1] - ex_ex2[0]**2))
            for k, ex_ex2 in sensor_expectations.items()
        }
        print('run: n = {}, counts = {}, result = {}'
              ''.format(n, counts, result))
        print('sensors = {}'.format(sensors))
        time.sleep(self.pause)
        self.port += 1
        return result
Code example #16
    def test_count(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        result = []
        (
            ssc.queueStream([range(20), ['a', 'b'], ['c']])
            .count()
            .foreachRDD(lambda rdd: result.append(rdd.collect()[0]))
        )

        ssc.start()
        if platform.system() == 'Windows':
            # Windows is much slower here, so use a higher timeout.
            ssc.awaitTermination(timeout=1.0)
        else:
            ssc.awaitTermination(timeout=0.3)
        self.assertEqual(sum(result), 23)
Code example #17
def test_timed_cache():
    m = Manip()

    # create a timed cache manager
    cm = fast_pyspark_tester.TimedCacheManager(timeout=1.0)

    # create a cache entry
    c = fast_pyspark_tester.Context(cache_manager=cm)
    rdd = c.parallelize(range(10), 2)
    rdd = rdd.map(m.trivial_manip_with_debug).cache()
    print(rdd.collect())
    # make sure the cache is working
    count_before = m.count
    print(rdd.collect())
    count_after = m.count
    assert count_before == count_after

    # wait to have the cache expire
    time.sleep(1.5)
    cm.gc()
    print(rdd.collect())
    assert m.count > count_after
Code example #18
def main():
    sc = fast_pyspark_tester.Context()
    ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 1)
    ssc.textFileStream('/var/log/system.log*').pprint()
    ssc.start()
    ssc.awaitTermination(timeout=3.0)
Code example #19
def read_csv(filename):
    c = fast_pyspark_tester.Context()
    r = c.textFile(filename)
    r = r.map(lambda l: l + 'something else')
    print(r.count())
Code example #20
    def test_save_gz(self):
        sc = fast_pyspark_tester.Context()
        ssc = fast_pyspark_tester.streaming.StreamingContext(sc, 0.1)

        (ssc.textFileStream(LICENSE_FILE).count().saveAsTextFiles(
            'tests/textout/', suffix='.gz'))
Code example #21
 def test_parallelize_single_element(self):
     my_rdd = fast_pyspark_tester.Context().parallelize([7], 100)
     self.assertEqual(my_rdd.collect(), [7])
Code example #22
 def setUp(self):
     self.pool = futures.ThreadPoolExecutor(4)
     self.sc = fast_pyspark_tester.Context(pool=self.pool)
Code example #23
 def setUp(self):
     self.sc = fast_pyspark_tester.Context()
Code example #24
 def setUp(self):
     self.pool = multiprocessing.Pool(4)
     self.sc = fast_pyspark_tester.Context(pool=self.pool)
Code example #25
 def setUp(self):
     self.pool = multiprocessing.Pool(4)
     self.sc = fast_pyspark_tester.Context(
         pool=self.pool,
         serializer=cloudpickle.dumps,
         deserializer=pickle.loads,
     )
Code example #26
 def test_parallelize_matched_elements(self):
     my_rdd = fast_pyspark_tester.Context().parallelize([1, 2, 3, 4, 5], 5)
     self.assertEqual(my_rdd.collect(), [1, 2, 3, 4, 5])
Code example #27
 def test_parallelize_empty_partitions_at_end(self):
     my_rdd = fast_pyspark_tester.Context().parallelize(range(3529), 500)
     print(my_rdd.getNumPartitions())
     my_rdd.foreachPartition(lambda p: print(sum(1 for _ in p)))
     self.assertEqual(my_rdd.getNumPartitions(), 500)
     self.assertEqual(my_rdd.count(), 3529)
Code example #28
 def setUp(self):
     self.pool = futures.ProcessPoolExecutor(4)
     self.sc = fast_pyspark_tester.Context(
         pool=self.pool,
         serializer=cloudpickle.dumps,
         deserializer=pickle.loads,
     )
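
The pool-based fixtures above only show setUp; a matching tearDown (not shown in the originals) would presumably release the pool. For the ProcessPoolExecutor variant:

 def tearDown(self):
     # Assumed counterpart to setUp: release the worker pool.
     self.pool.shutdown()

For the multiprocessing.Pool fixtures, the equivalent would be self.pool.close() followed by self.pool.join().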
Code example #29
 def test_broadcast(self):
     b = fast_pyspark_tester.Context().broadcast([1, 2, 3])
     self.assertEqual(b.value[0], 1)
Code example #30
 def test_version(self):
     self.assertTrue(isinstance(fast_pyspark_tester.Context().version, str))