def create_context(n_processes=0):
    """Build a gelanis Context, optionally backed by a process pool.

    :param n_processes: number of worker processes; falsy (default 0)
        yields a plain single-process context.
    :returns: a configured ``gelanis.Context``.
    """
    if not n_processes:
        return gelanis.Context()

    pool = futures.ProcessPoolExecutor(n_processes)
    # cloudpickle can serialize lambdas and closures that plain pickle
    # cannot; pickle.loads can still deserialize cloudpickle output,
    # so this serializer/deserializer pairing is valid.
    return gelanis.Context(
        pool=pool,
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads)
def test_lock1(self):
    """Creating a new RDD inside a map operation must be rejected."""
    context = gelanis.Context()

    def nested_parallelize():
        # parallelize() inside map() runs while the context is locked
        return context.parallelize(range(5)).map(
            lambda _: context.parallelize([1])
        ).collect()

    self.assertRaises(
        gelanis.exceptions.ContextIsLockedException,
        nested_parallelize,
    )
def _run_process(self, n, to_kv, format_):
    """Run a local streaming job and gather per-sensor statistics.

    Returns a tuple ``(counts, sensor_sums, sensor_squares,
    sensor_counts)`` accumulated from the received stream.
    """
    context = gelanis.Context()
    stream_context = gelanis.streaming.StreamingContext(context, 1.0)

    counts = []
    sensor_sums = defaultdict(float)
    sensor_squares = defaultdict(float)
    sensor_counts = defaultdict(int)

    if format_ in ('bello', 'struct'):
        record_length = {'bello': 5, 'struct': 8}[format_]
        stream = stream_context.socketBinaryStream(
            'localhost', self.port, record_length)
    else:
        stream = stream_context.socketTextStream('localhost', self.port)

    stream.count().foreachRDD(
        lambda _, rdd: counts.append(rdd.collect()[0]))

    if to_kv is not None:
        def accumulate(rdd):
            # fold each (key, values) group into the running statistics
            for key, values in rdd.collect():
                sensor_sums[key] += sum(values)
                sensor_squares[key] += sum(v**2 for v in values)
                sensor_counts[key] += len(values)

        stream.map(to_kv).groupByKey().foreachRDD(
            lambda _, rdd: accumulate(rdd))

    self.client(n, format_=format_)
    stream_context.start()
    stream_context.awaitTermination(timeout=5.0)

    return (counts, sensor_sums, sensor_squares, sensor_counts)
def test_union(self):
    """Union of two single-element RDDs preserves element order."""
    context = gelanis.Context()
    first = context.parallelize(['Hello'])
    second = context.parallelize(['World'])
    merged = context.union([first, second]).collect()
    print(merged)
    self.assertEqual(merged, ['Hello', 'World'])
def main():
    """Round-trip a large range through a gzipped text file."""
    temp = tempfile.NamedTemporaryFile(delete=True)
    temp.close()
    path = temp.name + '.gz'

    context = gelanis.Context()
    context.parallelize(range(1000000)).saveAsTextFile(path)
    context.textFile(path).collect()
def test_lock2(self):
    """Creating RDDs that contain other RDDs must be rejected."""
    context = gelanis.Context()

    def nested_rdds():
        inner = (context.parallelize(range(x)) for x in range(5))
        outer = context.parallelize(inner)
        print(outer.map(lambda rdd: rdd.collect()).collect())

    self.assertRaises(gelanis.exceptions.ContextIsLockedException,
                      nested_rdds)
def test_mapValues(self):
    """mapValues applies the function to values, leaving keys alone."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    collected = []
    stream = streaming.queueStream([[('a', [5, 8, 2]), ('b', [6, 3, 8])]])
    stream.mapValues(sorted).foreachRDD(
        lambda rdd: collected.append(rdd.collect()))

    streaming.start()
    streaming.awaitTermination(timeout=0.15)

    self.assertEqual(collected, [[('a', [2, 5, 8]), ('b', [3, 6, 8])]])
def test_connect(self):
    """Counting lines of the license file through a text file stream."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    collected = []
    stream = streaming.textFileStream('LICENS*', process_all=True)
    stream.count().foreachRDD(
        lambda rdd: collected.append(rdd.collect()[0]))

    streaming.start()
    streaming.awaitTermination(timeout=0.3)

    self.assertEqual(sum(collected), 44)
def test_cache_empty_partition():
    """A cached RDD is not recomputed, even with an empty partition."""
    manip = Manip()
    context = gelanis.Context()

    cached = (context.parallelize(range(10), 2)
              .map(manip.trivial_manip_with_debug)
              .filter(lambda e: e > 6)
              .cache())

    print(cached.collect())
    print(cached.collect())
    print(f'count of map executions: {manip.count}')
    # the map ran once over all 10 elements; the second collect hit the cache
    assert manip.count == 10
def test_read_chunks(self):
    """Counting 40-byte records of the license file via a binary stream."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    collected = []
    stream = streaming.fileBinaryStream('LICENS*', recordLength=40,
                                        process_all=True)
    stream.count().foreachRDD(
        lambda rdd: collected.append(rdd.collect()[0]))

    streaming.start()
    streaming.awaitTermination(timeout=0.3)

    self.assertEqual(sum(collected), 54)
def test_count(self):
    """count() yields the element count of every queued batch."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    collected = []
    batches = [range(20), ['a', 'b'], ['c']]
    streaming.queueStream(batches).count().foreachRDD(
        lambda rdd: collected.append(rdd.collect()[0]))

    streaming.start()
    streaming.awaitTermination(timeout=0.35)

    # 20 + 2 + 1 elements over the three batches
    self.assertEqual(sum(collected), 23)
def test_main(self):
    """A length-prefixed binary message arrives over the socket stream."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    counter = Counter()
    stream = streaming.socketBinaryStream('127.0.0.1', 8125, length='<I')
    stream.foreachRDD(lambda rdd: counter.update(rdd.collect()))

    self.client()
    streaming.start()
    streaming.awaitTermination(timeout=0.3)

    self.assertEqual(counter[b'hellohello'], 1)
def test_groupByKey(self):
    """groupByKey groups values per key within each batch."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    collected = []
    batches = [
        [('a', 5), ('b', 8), ('a', 2)],
        [('a', 2), ('b', 3)],
    ]
    (streaming.queueStream(batches)
     .groupByKey()
     .mapPartitions(sorted)
     .mapValues(sorted)
     .foreachRDD(lambda rdd: collected.append(rdd.collect())))

    streaming.start()
    streaming.awaitTermination(timeout=0.25)

    self.assertEqual(collected,
                     [[('a', [2, 5]), ('b', [8])],
                      [('a', [2]), ('b', [3])]])
def test_retry(self):
    """Failed partitions are retried until they succeed."""

    class FailEveryOtherCall:
        # Raises on every odd-numbered invocation; succeeds otherwise,
        # so each partition fails once and then passes on retry.
        def __init__(self):
            self.attempt = 0

        def __call__(self, value):
            self.attempt += 1
            if self.attempt % 2 == 1:
                raise Exception
            return value

    data = list(range(6))
    rdd = gelanis.Context().parallelize(data, 3)
    result = rdd.mapPartitions(FailEveryOtherCall()).collect()
    self.assertEqual(result, data)
def test_connect(self):
    """Characters sent by the client arrive over the text socket stream."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)

    counter = Counter()

    def tally(rdd):
        # Collect once per RDD. The original lambda called rdd.collect()
        # twice per callback (once for the truthiness check, once for the
        # update), recomputing the partition.
        lines = rdd.collect()
        if lines:
            counter.update(''.join(lines))

    streaming.socketTextStream('127.0.0.1', 8123).foreachRDD(tally)

    self.client()
    streaming.start()
    streaming.awaitTermination(timeout=0.3)

    self.assertEqual(counter['a'], 20)
def test_timed_cache():
    """Entries of a TimedCacheManager expire after the timeout."""
    manip = Manip()

    cache_manager = gelanis.TimedCacheManager(timeout=1.0)
    context = gelanis.Context(cache_manager=cache_manager)

    cached = (context.parallelize(range(10), 2)
              .map(manip.trivial_manip_with_debug)
              .cache())
    print(cached.collect())

    # the second collect must be served from the cache: no new map calls
    count_before = manip.count
    print(cached.collect())
    count_after = manip.count
    assert count_before == count_after

    # after the timeout (plus gc) the entry is gone and gets recomputed
    time.sleep(1.5)
    cache_manager.gc()
    print(cached.collect())
    assert manip.count > count_after
def test_parallelize_matched_elements(self):
    """Element count equal to partition count round-trips cleanly."""
    elements = [1, 2, 3, 4, 5]
    rdd = gelanis.Context().parallelize(elements, 5)
    self.assertEqual(rdd.collect(), elements)
def main():
    """Pretty-print new lines appearing in the system log for 3 seconds."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 1)
    streaming.textFileStream('/var/log/system.log*').pprint()
    streaming.start()
    streaming.awaitTermination(timeout=3.0)
def read_csv(filename):
    """Read *filename* as text, append a suffix to every line, print count."""
    context = gelanis.Context()
    lines = context.textFile(filename)
    suffixed = lines.map(lambda line: line + 'something else')
    print(suffixed.count())
def setUp(self):
    """Create a context backed by a 4-worker multiprocessing pool.

    cloudpickle serializes the closures sent to workers; pickle.loads
    can read cloudpickle output on the way back.
    """
    self.pool = multiprocessing.Pool(4)
    self.sc = gelanis.Context(
        pool=self.pool,
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads,
    )
def setUp(self):
    """Create a context backed by a 4-worker multiprocessing pool."""
    pool = multiprocessing.Pool(4)
    self.pool = pool
    self.sc = gelanis.Context(pool=pool)
def test_save_gz(self):
    """Wire up saving of stream counts as gzip text files."""
    context = gelanis.Context()
    streaming = gelanis.streaming.StreamingContext(context, 0.1)
    counts = streaming.textFileStream('LICENS*').count()
    counts.saveAsTextFiles('tests/textout/', suffix='.gz')
    # NOTE(review): the stream is never started here — presumably this
    # only checks that the pipeline can be constructed; confirm intent.
def test_trivial_sample():
    """Sampling is deterministic for a fixed seed."""
    source = gelanis.Context().parallelize(range(1000), 1000)
    picked = source.sample(False, 0.01, 42).collect()
    print(picked)
    assert picked == [97, 164, 294, 695, 807, 864, 911]
def setUp(self):
    """Provide a fresh single-process context for each test."""
    self.sc = gelanis.Context()
def test_broadcast(self):
    """Broadcast values are readable through the .value attribute."""
    broadcast = gelanis.Context().broadcast([1, 2, 3])
    self.assertEqual(broadcast.value[0], 1)
def test_version(self):
    """Context.version is exposed as a string."""
    version = gelanis.Context().version
    self.assertIsInstance(version, str)
def setUp(self):
    """Create a context backed by a 4-worker thread pool."""
    pool = futures.ThreadPoolExecutor(4)
    self.pool = pool
    self.sc = gelanis.Context(pool=pool)
def test_parallelize_single_element(self):
    """Far more partitions than elements still round-trips the data."""
    rdd = gelanis.Context().parallelize([7], 100)
    self.assertEqual(rdd.collect(), [7])
def test_parallelize_empty_partitions_at_end(self):
    """The requested partition count is honored even with trailing empties."""
    rdd = gelanis.Context().parallelize(range(3529), 500)
    print(rdd.getNumPartitions())
    rdd.foreachPartition(lambda partition: print(sum(1 for _ in partition)))
    self.assertEqual(rdd.getNumPartitions(), 500)
    self.assertEqual(rdd.count(), 3529)
def _sub_procedure(pool, n):
    """Run a small sleepy map job of *n* elements on *pool*."""
    context = gelanis.Context(pool=pool,
                              serializer=cloudpickle.dumps,
                              deserializer=pickle.loads)
    elements = context.parallelize(range(n), 10)
    elements.map(lambda _: time.sleep(0.01)).collect()