Esempio n. 1
0
def test_dataframe_schema_parsing(schema, printed_schema):
    """Check the text that printSchema() emits for a given schema.

    An empty DataFrame is created with ``schema``; whatever
    ``printSchema()`` writes to stdout is captured and compared with
    ``printed_schema``.
    """
    session = SparkSession(Context())
    empty_df = session.createDataFrame([], schema=schema)

    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        empty_df.printSchema()

    actual_output = captured.getvalue()
    assert printed_schema == actual_output
Esempio n. 2
0
 def test_session_storage_level(self):
     """Compare the storage level reprs before and after persist()."""
     session = SparkSession(Context())
     frame = session.range(4, numPartitions=2)

     # Level the frame is expected to report before persist() is called.
     initial_level = StorageLevel(False, False, False, False, 1)
     self.assertEqual(repr(frame.storageLevel), repr(initial_level))

     cached_frame = frame.persist()
     self.assertEqual(cached_frame.is_cached, True)
     self.assertEqual(
         repr(cached_frame.storageLevel),
         repr(StorageLevel.MEMORY_ONLY),
     )
Esempio n. 3
0
import datetime
import os
from unittest import TestCase

from gelanis import Context, Row
from gelanis.sql.session import SparkSession
from gelanis.sql.types import DateType, IntegerType, StringType, StructField, StructType, TimestampType

# Module-level session, created at import time and used by the reader tests
# defined in this module.
spark = SparkSession(Context())


class DataFrameReaderTests(TestCase):
    maxDiff = None

    def test_csv_read_without_schema(self):
        df = spark.read.csv(os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "data/fundings/"),
                            header=True)
        self.assertEqual(df.count(), 4)
        self.assertEqual(
            df.schema,
            StructType([
                StructField("permalink", StringType()),
                StructField("company", StringType()),
                StructField("numEmps", StringType()),
                StructField("category", StringType()),
                StructField("city", StringType()),
                StructField("state", StringType()),
                StructField("fundedDate", StringType()),
                StructField("raisedAmt", StringType()),
                StructField("raisedCurrency", StringType()),
Esempio n. 4
0
class SessionTests(TestCase):
    """Tests for SparkSession: range(), createDataFrame() and storage levels."""

    # Session created once, when the class body is executed; most tests reach
    # it through ``self.spark``.
    spark = SparkSession(sparkContext=Context())

    def test_session_range(self):
        """range(3) must expose ids 0, 1 and 2 through every read path."""
        expected_rows = [Row(id=value) for value in (0, 1, 2)]

        frame = self.spark.range(3)

        self.assertEqual(frame.count(), 3)
        self.assertListEqual(frame.collect(), expected_rows)
        # Same rows again, this time pulled through the local iterator.
        self.assertEqual(list(frame.toLocalIterator()), expected_rows)

    def test_session_create_data_frame_from_rdd(self):
        """Build a DataFrame from an RDD of tuples; check rows and schema."""
        pairs = ((1, "one"), (2, "two"), (3, "three"))
        source_rdd = self.spark.sparkContext.parallelize(list(pairs))
        frame = self.spark.createDataFrame(source_rdd)

        self.assertEqual(frame.count(), 3)

        # No column names were supplied, so the fields are expected to be
        # called _1 and _2.
        expected_rows = [Row(_1=number, _2=word) for number, word in pairs]
        self.assertListEqual(frame.collect(), expected_rows)

        expected_schema = StructType([
            StructField("_1", LongType(), True),
            StructField("_2", StringType(), True),
        ])
        self.assertEqual(frame.schema, expected_schema)

    def test_session_create_data_frame_from_list(self):
        """Build a DataFrame from a plain list of tuples; check rows and schema."""
        pairs = ((1, "one"), (2, "two"), (3, "three"))
        frame = self.spark.createDataFrame(list(pairs))

        self.assertEqual(frame.count(), 3)

        # No column names were supplied, so the fields are expected to be
        # called _1 and _2.
        expected_rows = [Row(_1=number, _2=word) for number, word in pairs]
        self.assertListEqual(frame.collect(), expected_rows)

        expected_schema = StructType([
            StructField("_1", LongType(), True),
            StructField("_2", StringType(), True),
        ])
        self.assertEqual(frame.schema, expected_schema)

    @pytest.mark.skipif(not has_pandas, reason='pandas is not installed')
    def test_session_create_data_frame_from_pandas_data_frame(self):
        """Build a DataFrame from a pandas DataFrame; check rows and schema.

        The pandas frame is created without column names, and the test
        expects its two columns to surface as the string field names
        "0" and "1".

        Raises:
            ImportError: if pandas cannot be imported even though the
                skip condition let the test run.
        """
        try:
            # Pandas is an optional dependency
            # pylint: disable=import-outside-toplevel
            import pandas as pd
        except ImportError as e:
            raise ImportError("pandas is not importable") from e

        pdf = pd.DataFrame([(1, "one"), (2, "two"), (3, "three")])

        df = self.spark.createDataFrame(pdf)

        self.assertEqual(df.count(), 3)
        # Every expected row uses the same field names ("0", "1") as the
        # schema asserted below.
        self.assertListEqual(df.collect(), [
            Row(**{
                "0": 1,
                "1": 'one'
            }),
            Row(**{
                "0": 2,
                "1": 'two'
            }),
            Row(**{
                "0": 3,
                "1": 'three'
            })
        ])
        self.assertEqual(
            df.schema,
            StructType([
                StructField("0", LongType(), True),
                StructField("1", StringType(), True)
            ]))

    def test_session_create_data_frame_from_list_with_col_names(self):
        """Build a DataFrame from tuples plus an explicit list of column names."""
        column_names = ["label", "features"]
        records = [
            (0.0, [1.0, 0.8]),
            (1.0, [0.0, 0.0]),
            (2.0, [0.5, 0.5]),
        ]
        frame = self.spark.createDataFrame(records, column_names)

        self.assertEqual(frame.count(), 3)

        expected_rows = [
            row_from_keyed_values([("label", 0.0), ("features", [1.0, 0.8])]),
            row_from_keyed_values([("label", 1.0), ("features", [0.0, 0.0])]),
            row_from_keyed_values([("label", 2.0), ("features", [0.5, 0.5])]),
        ]
        self.assertListEqual(frame.collect(), expected_rows)

        # Floats become DoubleType; the lists become arrays of DoubleType.
        expected_schema = StructType([
            StructField("label", DoubleType(), True),
            StructField("features", ArrayType(DoubleType(), True), True),
        ])
        self.assertEqual(frame.schema, expected_schema)

    def test_session_create_data_frame_from_list_with_schema(self):
        """A DataFrame built with an explicit map schema reports that schema."""
        map_type = MapType(StringType(), IntegerType())
        schema = StructType([StructField("map", map_type, True)])
        frame = self.spark.createDataFrame([({'a': 1}, )], schema=schema)

        self.assertEqual(frame.count(), 1)
        self.assertListEqual(frame.collect(), [Row(map={'a': 1})])
        self.assertEqual(frame.schema, schema)

    def test_session_storage_level(self):
        """Compare the storage level reprs before and after persist()."""
        session = SparkSession(Context())
        frame = session.range(4, numPartitions=2)

        # Level the frame is expected to report before persist() is called.
        initial_level = StorageLevel(False, False, False, False, 1)
        self.assertEqual(repr(frame.storageLevel), repr(initial_level))

        cached_frame = frame.persist()
        self.assertEqual(cached_frame.is_cached, True)
        self.assertEqual(
            repr(cached_frame.storageLevel),
            repr(StorageLevel.MEMORY_ONLY),
        )
from gelanis import Context

# Word count over README.md: every non-alphanumeric character is turned into
# a space, each line is split into words, and occurrences are summed per word.
counts = (
    Context()
    .textFile('README.md')
    .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
    # split() without a separator collapses runs of whitespace, so the empty
    # strings that split(' ') yields between adjacent spaces are no longer
    # counted as a "word".
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
print(counts.collect())
Esempio n. 6
0
from gelanis import Context

# Read the WARC and WAT path listings of the CC-MAIN-2015-11 Common Crawl.
# Both locations are handed to textFile as one comma-separated string.
paths_rdd = Context().textFile(','.join((
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    'warc.paths.*',
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    'wat.paths.gz',
)))

print(paths_rdd.collect())
Esempio n. 7
0
 def setUp(self):
     """Store a newly constructed Context on ``self.context``."""
     self.context = Context()
Esempio n. 8
0
class RDDTest(unittest.TestCase):
    """Tests for resilient distributed datasets (RDDs)"""
    def setUp(self):
        """Give every test its own newly constructed Context."""
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Left outer join of pair RDDs whose keys are unique on each side."""
        make_rdd = self.context.parallelize
        rdd_x = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        rdd_y = make_rdd([('b', 'yb'), ('c', 'yc')])
        rdd_z = make_rdd([('c', 'zc'), ('d', 'zd')])

        x_join_y = sorted(rdd_x.leftOuterJoin(rdd_y).collect())
        x_join_z = sorted(rdd_x.leftOuterJoin(rdd_z).collect())
        z_join_x = sorted(rdd_z.leftOuterJoin(rdd_x).collect())

        # Every left key is kept; a key missing on the right pairs with None.
        self.assertEqual(x_join_y, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])

        self.assertEqual(x_join_z, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
        ])

        self.assertEqual(z_join_x, [
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testLeftOuterJoinDuplicate(self):
        """Left outer join when a key occurs more than once on a side."""
        make_rdd = self.context.parallelize
        rdd_x = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        rdd_y = make_rdd([('b', 'yb'), ('c', 'yc')])
        rdd_z = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        x_join_y = sorted(rdd_x.leftOuterJoin(rdd_y).collect())
        x_join_z = sorted(rdd_x.leftOuterJoin(rdd_z).collect())

        self.assertEqual(x_join_y, [
            ('a', ('xa', None)),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])

        # Duplicates on both sides: every left/right combination for the key.
        self.assertEqual(x_join_z, [
            ('a', ('xa', None)),
            ('c', ('xc1', 'zc1')),
            ('c', ('xc1', 'zc2')),
            ('c', ('xc2', 'zc1')),
            ('c', ('xc2', 'zc2')),
        ])

    def testRightOuterJoinSimple(self):
        """Right outer join of pair RDDs whose keys are unique on each side."""
        make_rdd = self.context.parallelize
        rdd_x = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        rdd_y = make_rdd([('b', 'yb'), ('c', 'yc')])
        rdd_z = make_rdd([('c', 'zc'), ('d', 'zd')])

        x_join_y = sorted(rdd_x.rightOuterJoin(rdd_y).collect())
        x_join_z = sorted(rdd_x.rightOuterJoin(rdd_z).collect())
        z_join_x = sorted(rdd_z.rightOuterJoin(rdd_x).collect())

        # Every right key is kept; a key missing on the left pairs with None.
        self.assertEqual(x_join_y, [
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])

        self.assertEqual(x_join_z, [
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])

        self.assertEqual(z_join_x, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
        ])

    def testRightOuterJoinDuplicate(self):
        """Right outer join when a key occurs more than once on a side."""
        make_rdd = self.context.parallelize
        rdd_x = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        rdd_y = make_rdd([('b', 'yb'), ('c', 'yc')])
        rdd_z = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        x_join_y = sorted(rdd_x.rightOuterJoin(rdd_y).collect())
        x_join_z = sorted(rdd_x.rightOuterJoin(rdd_z).collect())

        self.assertEqual(x_join_y, [
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])

        # Duplicates on both sides: every left/right combination for the key.
        self.assertEqual(x_join_z, [
            ('c', ('xc1', 'zc1')),
            ('c', ('xc1', 'zc2')),
            ('c', ('xc2', 'zc1')),
            ('c', ('xc2', 'zc2')),
            ('d', (None, 'zd')),
        ])

    def testFullOuterJoinSimple(self):
        """Full outer join of pair RDDs whose keys are unique on each side."""
        make_rdd = self.context.parallelize
        rdd_x = make_rdd([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        rdd_y = make_rdd([('b', 'yb'), ('c', 'yc')])
        rdd_z = make_rdd([('c', 'zc'), ('d', 'zd')])

        x_join_y = sorted(rdd_x.fullOuterJoin(rdd_y).collect())
        x_join_z = sorted(rdd_x.fullOuterJoin(rdd_z).collect())
        z_join_x = sorted(rdd_z.fullOuterJoin(rdd_x).collect())

        # Keys from both sides are kept; the missing side is filled with None.
        self.assertEqual(x_join_y, [
            ('a', ('xa', None)),
            ('b', ('xb', 'yb')),
            ('c', ('xc', 'yc')),
        ])

        self.assertEqual(x_join_z, [
            ('a', ('xa', None)),
            ('b', ('xb', None)),
            ('c', ('xc', 'zc')),
            ('d', (None, 'zd')),
        ])

        self.assertEqual(z_join_x, [
            ('a', (None, 'xa')),
            ('b', (None, 'xb')),
            ('c', ('zc', 'xc')),
            ('d', ('zd', None)),
        ])

    def testFullOuterJoinDuplicate(self):
        """Full outer join when a key occurs more than once on a side."""
        make_rdd = self.context.parallelize
        rdd_x = make_rdd([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        rdd_y = make_rdd([('b', 'yb'), ('c', 'yc')])
        rdd_z = make_rdd([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        x_join_y = sorted(rdd_x.fullOuterJoin(rdd_y).collect())
        x_join_z = sorted(rdd_x.fullOuterJoin(rdd_z).collect())

        self.assertEqual(x_join_y, [
            ('a', ('xa', None)),
            ('b', (None, 'yb')),
            ('c', ('xc1', 'yc')),
            ('c', ('xc2', 'yc')),
        ])

        # Duplicates on both sides: every left/right combination for the key.
        self.assertEqual(x_join_z, [
            ('a', ('xa', None)),
            ('c', ('xc1', 'zc1')),
            ('c', ('xc1', 'zc2')),
            ('c', ('xc2', 'zc1')),
            ('c', ('xc2', 'zc2')),
            ('d', (None, 'zd')),
        ])

    def test_cartesian(self):
        """cartesian() must pair each element of one RDD with each of the other."""
        firsts = self.context.parallelize(range(0, 2))
        seconds = self.context.parallelize(range(3, 6))

        pairs = sorted(firsts.cartesian(seconds).collect())

        expected = sorted((a, b) for a in (0, 1) for b in (3, 4, 5))
        self.assertListEqual(pairs, expected)

    def test_sample(self):
        """sample(False, 0.1, 81) over 100 items must keep 6 to 14 of them."""
        numbers = self.context.parallelize(range(100), 4)
        sample_size = numbers.sample(False, 0.1, 81).count()
        self.assertTrue(6 <= sample_size <= 14)

    def test_sampleByKey(self):
        """sampleByKey must keep roughly the requested share of each key."""
        fractions = {"a": 0.2, "b": 0.1}
        values_rdd = self.context.parallelize(range(0, 1000))
        keys_rdd = self.context.parallelize(fractions.keys())
        # Every key is paired with every value 0..999 before sampling.
        keyed_rdd = keys_rdd.cartesian(values_rdd)

        grouped = keyed_rdd.sampleByKey(False, fractions, 2).groupByKey()
        sample = dict(grouped.collect())

        # 1000 candidates per key: the bounds bracket 20% and 10% of them.
        self.assertTrue(100 < len(sample["a"]) < 300
                        and 50 < len(sample["b"]) < 150)

        # Sampled values must stay inside the original 0..999 range.
        for key in ("a", "b"):
            self.assertTrue(max(sample[key]) <= 999 and min(sample[key]) >= 0)

    def test_groupByKey(self):
        """groupByKey must group values without ever ordering them."""
        class IncomparableValue:
            """Supports ``==`` only; ``<`` raises so any ordering attempt fails."""

            def __init__(self, value):
                self.value = value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

            def __eq__(self, other):
                return self.value == other.value

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValue(i) for i in range(len(keys))]

        keyed_rdd = self.context.parallelize(zip(keys, values))
        groups_by_key = dict(keyed_rdd.groupByKey().collect())

        # Keys repeat with period 3, so key k owns values[k::3].
        expected = {0: values[0::3], 1: values[1::3], 2: values[2::3]}

        for key, members in expected.items():
            self.assertIn(key, groups_by_key)

            for member in members:
                self.assertIn(member, groups_by_key[key])

    def test_reduceByKey(self):
        """reduceByKey(add) must sum values per key without ordering them."""
        class IncomparableValueAddable:
            """Supports ``==`` and ``+``; ``<`` raises so any ordering fails."""

            def __init__(self, value):
                self.value = value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValueAddable(i) for i in range(len(keys))]

        keyed_rdd = self.context.parallelize(zip(keys, values))
        reduced = dict(keyed_rdd.reduceByKey(add).collect())

        # Values 0..5 are dealt to keys 0, 1, 2 in turn, so the per-key sums
        # are 0+3, 1+4 and 2+5.
        expected_totals = {0: 3, 1: 5, 2: 7}

        # Looked up by key, so the order of the collected pairs is irrelevant.
        for key, total in expected_totals.items():
            self.assertEqual(reduced[key], IncomparableValueAddable(total))

    def test_reduceByKey_with_numPartition(self):
        """reduceByKey(add, numPartitions=20) must still sum values per key."""
        class IncomparableValueAddable:
            """Supports ``==`` and ``+``; ``<`` raises so any ordering fails."""

            def __init__(self, value):
                self.value = value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

        keys = (0, 1, 2, 0, 1, 2)
        values = [IncomparableValueAddable(i) for i in range(len(keys))]

        keyed_rdd = self.context.parallelize(zip(keys, values))
        reduced = dict(keyed_rdd.reduceByKey(add, numPartitions=20).collect())

        # Values 0..5 are dealt to keys 0, 1, 2 in turn, so the per-key sums
        # are 0+3, 1+4 and 2+5.
        expected_totals = {0: 3, 1: 5, 2: 7}

        # Looked up by key, so the order of the collected pairs is irrelevant.
        for key, total in expected_totals.items():
            self.assertEqual(reduced[key], IncomparableValueAddable(total))
Esempio n. 9
0
from gelanis import Context

# Count the lines of the files matching tests/*.py, and how many of those
# lines start with "import ".
my_rdd = Context().textFile('tests/*.py')

unfiltered_count = my_rdd.count()
import_lines = my_rdd.filter(lambda line: line.startswith("import "))
filtered_count = import_lines.count()
print(f'In tests/*.py: all lines={unfiltered_count}, '
      f'with import={filtered_count}')
Esempio n. 10
0
from gelanis import Context

# Open the by_subject files of the Human Microbiome Project demo bucket and
# print what takeSample(True, 1) returns for them.
by_subject_rdd = Context().textFile(
    's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*'
)
sample = by_subject_rdd.takeSample(True, 1)
print(sample)