def test_dataframe_schema_parsing(schema, printed_schema):
    spark = SparkSession(Context())
    df = spark.createDataFrame([], schema=schema)

    f = io.StringIO()
    with contextlib.redirect_stdout(f):
        df.printSchema()

    assert printed_schema == f.getvalue()
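# Hedged illustration of one (schema, printed_schema) pair the test above
# might receive, e.g. via pytest.mark.parametrize. The expected tree text is
# an assumption based on PySpark's printSchema() output format; gelanis may
# render it differently.
import contextlib
import io

from gelanis import Context
from gelanis.sql.session import SparkSession
from gelanis.sql.types import LongType, StructField, StructType

spark = SparkSession(Context())
df = spark.createDataFrame([], schema=StructType([StructField('id', LongType())]))
f = io.StringIO()
with contextlib.redirect_stdout(f):
    df.printSchema()
print(repr(f.getvalue()))  # assumed: 'root\n |-- id: long (nullable = true)\n\n'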
import datetime
import os
from unittest import TestCase

from gelanis import Context, Row
from gelanis.sql.session import SparkSession
from gelanis.sql.types import (
    DateType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

spark = SparkSession(Context())


class DataFrameReaderTests(TestCase):
    maxDiff = None

    def test_csv_read_without_schema(self):
        df = spark.read.csv(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "data/fundings/"),
            header=True)
        self.assertEqual(df.count(), 4)
        self.assertEqual(
            df.schema,
            StructType([
                StructField("permalink", StringType()),
                StructField("company", StringType()),
                StructField("numEmps", StringType()),
                StructField("category", StringType()),
                StructField("city", StringType()),
                StructField("state", StringType()),
                StructField("fundedDate", StringType()),
                StructField("raisedAmt", StringType()),
                StructField("raisedCurrency", StringType()),
            ]))
import pytest

from unittest import TestCase

from gelanis import Context, Row, StorageLevel
from gelanis.sql.session import SparkSession
from gelanis.sql.types import (
    ArrayType,
    DoubleType,
    IntegerType,
    LongType,
    MapType,
    StringType,
    StructField,
    StructType,
)
from gelanis.sql.types import row_from_keyed_values  # assumed module path

try:
    # pandas is an optional dependency; only probe for availability here
    import pandas  # noqa: F401
    has_pandas = True
except ImportError:
    has_pandas = False


class SessionTests(TestCase):
    spark = SparkSession(sparkContext=Context())

    def test_session_range(self):
        df = self.spark.range(3)
        self.assertEqual(df.count(), 3)
        self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)])
        self.assertEqual(
            list(df.toLocalIterator()),
            [Row(id=0), Row(id=1), Row(id=2)])

    def test_session_create_data_frame_from_rdd(self):
        df = self.spark.createDataFrame(
            self.spark.sparkContext.parallelize([
                (1, "one"),
                (2, "two"),
                (3, "three"),
            ]))
        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(),
            [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("_1", LongType(), True),
                StructField("_2", StringType(), True)
            ]))

    def test_session_create_data_frame_from_list(self):
        df = self.spark.createDataFrame([
            (1, "one"),
            (2, "two"),
            (3, "three"),
        ])
        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(),
            [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("_1", LongType(), True),
                StructField("_2", StringType(), True)
            ]))

    @pytest.mark.skipif(not has_pandas, reason='pandas is not installed')
    def test_session_create_data_frame_from_pandas_data_frame(self):
        try:
            # Pandas is an optional dependency
            # pylint: disable=import-outside-toplevel
            import pandas as pd
        except ImportError as e:
            raise ImportError("pandas is not importable") from e

        pdf = pd.DataFrame([(1, "one"), (2, "two"), (3, "three")])
        df = self.spark.createDataFrame(pdf)

        self.assertEqual(df.count(), 3)
        self.assertListEqual(df.collect(), [
            Row(**{"0": 1, "1": 'one'}),
            Row(**{"0": 2, "1": 'two'}),
            Row(**{"0": 3, "1": 'three'})
        ])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("0", LongType(), True),
                StructField("1", StringType(), True)
            ]))

    def test_session_create_data_frame_from_list_with_col_names(self):
        df = self.spark.createDataFrame(
            [(0.0, [1.0, 0.8]),
             (1.0, [0.0, 0.0]),
             (2.0, [0.5, 0.5])],
            ["label", "features"])
        self.assertEqual(df.count(), 3)
        self.assertListEqual(df.collect(), [
            row_from_keyed_values([("label", 0.0), ("features", [1.0, 0.8])]),
            row_from_keyed_values([("label", 1.0), ("features", [0.0, 0.0])]),
            row_from_keyed_values([("label", 2.0), ("features", [0.5, 0.5])]),
        ])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("label", DoubleType(), True),
                StructField("features", ArrayType(DoubleType(), True), True)
            ]))

    def test_session_create_data_frame_from_list_with_schema(self):
        schema = StructType(
            [StructField("map", MapType(StringType(), IntegerType()), True)])
        df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
        self.assertEqual(df.count(), 1)
        self.assertListEqual(df.collect(), [Row(map={'a': 1})])
        self.assertEqual(df.schema, schema)

    def test_session_storage_level(self):
        spark = SparkSession(Context())
        df = spark.range(4, numPartitions=2)
        self.assertEqual(repr(df.storageLevel),
                         repr(StorageLevel(False, False, False, False, 1)))
        persisted_df = df.persist()
        self.assertEqual(persisted_df.is_cached, True)
        self.assertEqual(repr(persisted_df.storageLevel),
                         repr(StorageLevel.MEMORY_ONLY))
from gelanis import Context

counts = (
    Context()
    .textFile('README.md')
    .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
print(counts.collect())
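# Follow-up sketch (not part of the original example): rank the word counts.
# This assumes gelanis mirrors PySpark's RDD.takeOrdered(num, key=None).
top_words = counts.takeOrdered(10, key=lambda kv: -kv[1])
print(top_words)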
from gelanis import Context

# read all the paths of warc and wat files of the latest Common Crawl
paths_rdd = Context().textFile(
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    'warc.paths.*,'
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    'wat.paths.gz',
)
print(paths_rdd.collect())
import unittest
from operator import add

from gelanis import Context


class RDDTest(unittest.TestCase):
    """Tests for the resilient distributed datasets."""

    def setUp(self):
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Test the basic left outer join with simple key-value pairs."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])
        self.assertEqual(zx, [('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    def testLeftOuterJoinDuplicate(self):
        """Test the left outer join with duplicate keys."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives a cartesian product.
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2'))])

    def testRightOuterJoinSimple(self):
        """Test the basic right outer join with simple key-value pairs."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])
        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    def testRightOuterJoinDuplicate(self):
        """Test the right outer join with duplicate keys."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives a cartesian product.
        self.assertEqual(xz, [('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Test the basic full outer join with simple key-value pairs."""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])
        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    def testFullOuterJoinDuplicate(self):
        """Test the full outer join with duplicate keys."""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])
        # Two sets of duplicate keys gives a cartesian product.
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def test_cartesian(self):
        x = self.context.parallelize(range(0, 2))
        y = self.context.parallelize(range(3, 6))
        c = x.cartesian(y)
        result = sorted(c.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        rdd = self.context.parallelize(range(100), 4)
        self.assertTrue(6 <= rdd.sample(False, 0.1, 81).count() <= 14)

    def test_sampleByKey(self):
        fractions = {"a": 0.2, "b": 0.1}
        range_rdd = self.context.parallelize(range(0, 1000))
        rdd = self.context.parallelize(fractions.keys()).cartesian(range_rdd)
        sample = dict(
            rdd.sampleByKey(False, fractions, 2).groupByKey().collect())
        self.assertTrue(100 < len(sample["a"]) < 300
                        and 50 < len(sample["b"]) < 150)
        self.assertTrue(max(sample["a"]) <= 999 and min(sample["a"]) >= 0)
        self.assertTrue(max(sample["b"]) <= 999 and min(sample["b"]) >= 0)

    def test_groupByKey(self):
        # This will fail if the values of the RDD need to be compared.
        class IncomparableValue:
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValue(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.groupByKey().collect()
        expected_group = ((0, r[0::3]), (1, r[1::3]), (2, r[2::3]))

        grouped_dict = dict(actual_group)
        for k, v in expected_group:
            self.assertIn(k, grouped_dict)
            for vv in v:
                self.assertIn(vv, grouped_dict[k])

    def test_reduceByKey(self):
        # This will fail if the values of the RDD need to be compared.
        class IncomparableValueAddable:
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.reduceByKey(add).collect()
        expected_group = ((0, IncomparableValueAddable(3)),
                          (1, IncomparableValueAddable(5)),
                          (2, IncomparableValueAddable(7)))

        grouped_dict = dict(actual_group)
        # Keep this order-agnostic.
        for k, v in expected_group:
            self.assertEqual(grouped_dict[k], v)

    def test_reduceByKey_with_numPartition(self):
        # This will fail if the values of the RDD need to be compared.
        class IncomparableValueAddable:
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.reduceByKey(add, numPartitions=20).collect()
        expected_group = ((0, IncomparableValueAddable(3)),
                          (1, IncomparableValueAddable(5)),
                          (2, IncomparableValueAddable(7)))

        grouped_dict = dict(actual_group)
        # Keep this order-agnostic.
        for k, v in expected_group:
            self.assertEqual(grouped_dict[k], v)
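# Standard unittest entry point (not in the original excerpt), assuming the
# test file is meant to be runnable directly:
if __name__ == '__main__':
    unittest.main()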
from gelanis import Context

my_rdd = Context().textFile('tests/*.py')
unfiltered_count = my_rdd.count()
filtered_count = my_rdd.filter(lambda l: l.startswith("import ")).count()
print(
    f'In tests/*.py: all lines={unfiltered_count}, with import={filtered_count}'
)
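# Variant sketch (not part of the original example): my_rdd is traversed
# twice above (count, then filter().count()), so tests/*.py is read twice.
# Assuming gelanis mirrors PySpark's RDD.cache(), caching avoids the re-read:
cached_rdd = Context().textFile('tests/*.py').cache()
print(cached_rdd.count(),
      cached_rdd.filter(lambda l: l.startswith("import ")).count())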
from gelanis import Context

by_subject_rdd = Context().textFile(
    's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*')
print(by_subject_rdd.takeSample(True, 1))
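# takeSample(withReplacement, num) draws a fixed-size sample. Passing a seed
# makes the draw reproducible, assuming gelanis mirrors PySpark's optional
# `seed` parameter:
print(by_subject_rdd.takeSample(True, 1, seed=42))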