def test_read_7z():
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    rdd = Context().textFile('{}/data.7z'.format(LOCAL_TEST_PATH))
    print(rdd.collect())
    assert 'from fast_pyspark_tester import Context' in rdd.collect()
def test_s3_textFile():
    myrdd = Context().textFile(
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*'
    )
    assert (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz' in myrdd.collect()
    )
def test_saveAsTextFile_zip():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
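    # Saving to a path ending in '.zip' writes the output as a zip archive,
    # which textFile() reads back below.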
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name + '.zip')
    read_rdd = Context().textFile(tempFile.name + '.zip')
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()
def test_local_regex_read():
    # was not working before 0.3.19
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(30), 30).saveAsTextFile(tempFile.name)
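    # Saving with 30 partitions writes part-00000 .. part-00029; the glob
    # 'part-0000*' below matches only the first ten part files.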
    d = Context().textFile(tempFile.name + '/part-0000*').collect()
    print(d)
    assert len(d) == 10
def test_hdfs_file_exists():
    random.seed()

    fn1 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH, int(random.random() * 999999))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH, int(random.random() * 999999))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
def test_read_tar_gz_20news():
    # The 20 Newsgroups dataset contains some '0xff' bytes that led to
    # encoding errors in earlier versions; this test covers that case.
    src = 'http://qwone.com/~jason/20Newsgroups/20news-19997.tar.gz'
    tgt = Path(__file__).parent / os.path.basename(src)

    if not tgt.is_file():
        # Fetch the archive once to speed up future test runs.
        tgt.write_bytes(requests.get(src).content)

    rdd = Context().textFile(str(tgt), use_unicode=False)
    assert '}|> 1. Mechanical driven odometer:' in rdd.top(500)
def test_s3_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(S3_TEST_PATH, int(random.random() * 999999))

    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert rdd.count() == rdd_check.count() and all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
def test_saveAsTextFile():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name)
    with open(tempFile.name, 'r') as f:
        r = f.readlines()
        print(r)
        assert '5\n' in r
def test_cache():
    # this crashes in version 0.2.28
    lines = Context().textFile('{}/*textFil*.py'.format(LOCAL_TEST_PATH))
    lines = lines.map(lambda l: '-' + l).cache()
    print(len(lines.collect()))
    lines = lines.map(lambda l: '+' + l)
    lines = lines.map(lambda l: '-' + l).cache()
    lines = lines.collect()
    print(lines)
    assert '-+-from fast_pyspark_tester import Context' in lines
import datetime
import os
import shutil
from unittest import TestCase

from dateutil.tz import tzlocal

from fast_pyspark_tester import Context, Row
from fast_pyspark_tester.sql.session import SparkSession
from fast_pyspark_tester.sql.utils import AnalysisException

spark = SparkSession(Context())


def get_folder_content(folder_path):
    folder_content = {}
    for root, _, files in os.walk(folder_path):
        relative_path = root[len(folder_path):]
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r') as file_content:
                folder_content[os.path.join(relative_path,
                                            file)] = file_content.readlines()
    return folder_content
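
# Illustration of the mapping returned by get_folder_content (hypothetical
# folder layout, shown only to document the structure):
#
#   get_folder_content('/tmp/out')
#   # -> {'part-00000': ['1\n', '2\n'], 'part-00001': ['3\n']}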


class DataFrameWriterTests(TestCase):
    maxDiff = None

    @staticmethod
    def clean():
def test_wholeTextFiles():
    all_files = Context().wholeTextFiles(f'{LOCAL_TEST_PATH}/*.py')
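    # wholeTextFiles() yields (file name, file content) pairs, so lookup()
    # returns a list holding the full text of the matching file.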
    this_file = all_files.lookup(LOCAL_FILENAME)
    print(this_file)
    assert 'test_wholeTextFiles' in this_file[0]
def test_local_textFile_name():
    name = Context().textFile('{}/*.py'.format(LOCAL_TEST_PATH)).name()
    print(name)
    assert name.startswith('{}/*.py'.format(LOCAL_TEST_PATH))
def test_local_textFile_2():
    line_count = Context().textFile('{}/*.py'.format(LOCAL_TEST_PATH)).count()
    print(line_count)
    assert line_count > 90
def test_local_textFile_1():
    lines = Context().textFile('{}/*textFil*.py'.format(LOCAL_TEST_PATH))
    lines = lines.collect()
    print(lines)
    assert 'from fast_pyspark_tester import Context' in lines
class RDDTest(unittest.TestCase):
    """Tests for the resilient distributed databases"""
    def setUp(self):
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Test the basic left outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)), ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])

        self.assertEqual(zx, [('c', ('zc', 'xc')), ('d', ('zd', None))])

    def testLeftOuterJoinDuplicate(self):
        """Test the left outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys give the cartesian product of the matching values
        self.assertEqual(
            xz,
            [
                ('a', ('xa', None)),
                ('c', ('xc1', 'zc1')),
                ('c', ('xc1', 'zc2')),
                ('c', ('xc2', 'zc1')),
                ('c', ('xc2', 'zc2')),
            ],
        )

    def testRightOuterJoinSimple(self):
        """Test the basic right outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')), ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('c', ('xc', 'zc')), ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')), ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    def testRightOuterJoinDuplicate(self):
        """Test the right outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')), ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys give the cartesian product of the matching values
        self.assertEqual(
            xz,
            [
                ('c', ('xc1', 'zc1')),
                ('c', ('xc1', 'zc2')),
                ('c', ('xc2', 'zc1')),
                ('c', ('xc2', 'zc2')),
                ('d', (None, 'zd')),
            ],
        )

    def testFullOuterJoinSimple(self):
        """Test the basic full outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)), ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(
            xz,
            [('a', ('xa', None)), ('b', ('xb', None)), ('c', ('xc', 'zc')),
             ('d', (None, 'zd'))],
        )

        self.assertEqual(
            zx,
            [('a', (None, 'xa')), ('b', (None, 'xb')), ('c', ('zc', 'xc')),
             ('d', ('zd', None))],
        )

    def testFullOuterJoinDuplicate(self):
        """Test the full outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(
            xy,
            [('a', ('xa', None)), ('b', (None, 'yb')), ('c', ('xc1', 'yc')),
             ('c', ('xc2', 'yc'))],
        )

        # Two sets of duplicate keys give the cartesian product of the matching values
        self.assertEqual(
            xz,
            [
                ('a', ('xa', None)),
                ('c', ('xc1', 'zc1')),
                ('c', ('xc1', 'zc2')),
                ('c', ('xc2', 'zc1')),
                ('c', ('xc2', 'zc2')),
                ('d', (None, 'zd')),
            ],
        )

    def test_cartesian(self):
        x = self.context.parallelize(range(0, 2))
        y = self.context.parallelize(range(3, 6))
        c = x.cartesian(y)
        result = sorted(c.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        rdd = self.context.parallelize(range(100), 4)
        self.assertTrue(6 <= rdd.sample(False, 0.1, 81).count() <= 14)

    def test_sampleByKey(self):
        fractions = {'a': 0.2, 'b': 0.1}
        range_rdd = self.context.parallelize(range(0, 1000))
        rdd = self.context.parallelize(fractions.keys()).cartesian(range_rdd)
        sample = dict(
            rdd.sampleByKey(False, fractions, 2).groupByKey().collect())
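        # With 1000 values per key, fractions of 0.2 and 0.1 give expected
        # sample sizes near 200 for 'a' and 100 for 'b'; the asserted ranges
        # below allow for sampling variance.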
        self.assertTrue(100 < len(sample['a']) < 300
                        and 50 < len(sample['b']) < 150)
        self.assertTrue(max(sample['a']) <= 999 and min(sample['a']) >= 0)
        self.assertTrue(max(sample['b']) <= 999 and min(sample['b']) >= 0)

    def test_groupByKey(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValue(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __lt__(self, other):
                raise NotImplementedError('This object cannot be compared')

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValue(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.groupByKey().collect()
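        # Keys cycle through 0, 1, 2, so key k groups every third value
        # starting at index k (the r[k::3] slices used below).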

        expected_group = ((0, r[0::3]), (1, r[1::3]), (2, r[2::3]))

        grouped_dict = dict(actual_group)

        for k, v in expected_group:
            self.assertIn(k, grouped_dict)

            for vv in v:
                self.assertIn(vv, grouped_dict[k])

    def test_reduceByKey(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValueAddable(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError('This object cannot be compared')

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.reduceByKey(add).collect()

        expected_group = (
            (0, IncomparableValueAddable(3)),
            (1, IncomparableValueAddable(5)),
            (2, IncomparableValueAddable(7)),
        )

        grouped_dict = dict(actual_group)

        # Keep this order-agnostic
        for k, v in expected_group:
            self.assertEqual(grouped_dict[k], v)

    def test_reduceByKey_with_numPartition(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValueAddable(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError('This object cannot be compared')

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.reduceByKey(add, numPartitions=20).collect()

        expected_group = (
            (0, IncomparableValueAddable(3)),
            (1, IncomparableValueAddable(5)),
            (2, IncomparableValueAddable(7)),
        )

        grouped_dict = dict(actual_group)

        # Keep this order-agnostic
        for k, v in expected_group:
            self.assertEqual(grouped_dict[k], v)
def test_pyspark_compatibility_gz():
    kv = Context().textFile('{}/pyspark/key_value.txt.gz'.format(LOCAL_TEST_PATH)).collect()
    print(kv)
    assert 'a\t1' in kv and 'b\t2' in kv and len(kv) == 2
def test_pyspark_compatibility_txt():
    kv = Context().textFile('{}/pyspark/key_value.txt'.format(LOCAL_TEST_PATH)).collect()
    print(kv)
    assert "('a', 1)" in kv and "('b', 2)" in kv and len(kv) == 2
from __future__ import print_function

from fast_pyspark_tester import Context

counts = (
    Context()
    .textFile('README.rst')
    .map(lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line))
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)
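
# collect() returns a list of (word, count) pairs; the exact counts depend on
# the current contents of README.rst.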
print(counts.collect())
class SessionTests(TestCase):
    spark = SparkSession(sparkContext=Context())

    def test_session_range(self):
        df = self.spark.range(3)
        self.assertEqual(df.count(), 3)
        self.assertListEqual(df.collect(), [Row(id=0), Row(id=1), Row(id=2)])
        self.assertEqual(list(df.toLocalIterator()), [Row(id=0), Row(id=1), Row(id=2)])

    def test_session_create_data_frame_from_rdd(self):
        df = self.spark.createDataFrame(self.spark.sparkContext.parallelize([(1, 'one'), (2, 'two'), (3, 'three')]))
        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(), [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')],
        )
        self.assertEqual(
            df.schema, StructType([StructField('_1', LongType(), True), StructField('_2', StringType(), True)]),
        )

    def test_session_create_data_frame_from_list(self):
        df = self.spark.createDataFrame([(1, 'one'), (2, 'two'), (3, 'three')])
        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(), [Row(_1=1, _2='one'), Row(_1=2, _2='two'), Row(_1=3, _2='three')],
        )
        self.assertEqual(
            df.schema, StructType([StructField('_1', LongType(), True), StructField('_2', StringType(), True)]),
        )

    @pytest.mark.skipif(not has_pandas, reason='pandas is not installed')
    def test_session_create_data_frame_from_pandas_data_frame(self):
        try:
            # Pandas is an optional dependency
            # pylint: disable=import-outside-toplevel
            import pandas as pd
        except ImportError:
            raise Exception('pandas is not importable')

        pdf = pd.DataFrame([(1, 'one'), (2, 'two'), (3, 'three')])

        df = self.spark.createDataFrame(pdf)

        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(), [Row(**{'0': 1, '1': 'one'}), Row(**{'0': 2, '1': 'two'}), Row(**{'0': 3, '1': 'three'})],
        )
        self.assertEqual(
            df.schema, StructType([StructField('0', LongType(), True), StructField('1', StringType(), True)]),
        )

    def test_session_create_data_frame_from_list_with_col_names(self):
        df = self.spark.createDataFrame(
            [(0.0, [1.0, 0.8]), (1.0, [0.0, 0.0]), (2.0, [0.5, 0.5])], ['label', 'features'],
        )
        self.assertEqual(df.count(), 3)
        self.assertListEqual(
            df.collect(),
            [
                row_from_keyed_values([('label', 0.0), ('features', [1.0, 0.8])]),
                row_from_keyed_values([('label', 1.0), ('features', [0.0, 0.0])]),
                row_from_keyed_values([('label', 2.0), ('features', [0.5, 0.5])]),
            ],
        )

        self.assertEqual(
            df.schema,
            StructType(
                [
                    StructField('label', DoubleType(), True),
                    StructField('features', ArrayType(DoubleType(), True), True),
                ]
            ),
        )

    def test_session_create_data_frame_from_list_with_schema(self):
        schema = StructType([StructField('map', MapType(StringType(), IntegerType()), True)])
        df = self.spark.createDataFrame([({'a': 1},)], schema=schema)
        self.assertEqual(df.count(), 1)
        self.assertListEqual(df.collect(), [Row(map={'a': 1})])
        self.assertEqual(df.schema, schema)

    def test_session_storage_level(self):
        spark = SparkSession(Context())
        df = spark.range(4, numPartitions=2)
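        # Before persist() the data frame reports the default storage level
        # (no disk, no memory, no off-heap, not deserialized, replication 1).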
        self.assertEqual(repr(df.storageLevel), repr(StorageLevel(False, False, False, False, 1)))
        persisted_df = df.persist()
        self.assertEqual(persisted_df.is_cached, True)
        self.assertEqual(repr(persisted_df.storageLevel), repr(StorageLevel.MEMORY_ONLY))
from __future__ import print_function

from fast_pyspark_tester import Context

by_subject_rdd = Context().textFile(
    's3n://human-microbiome-project/DEMO/HM16STR/46333/by_subject/*')
print(by_subject_rdd.takeSample(1))
def test_http_textFile():
    myrdd = Context().textFile(
        'https://s3-us-west-2.amazonaws.com/human-microbiome-project/DEMO/'
        'HM16STR/46333/by_subject/1139.fsa'
    )
    assert 'TGCTGCGGTGAATGCGTTCCCGGGTCT' in myrdd.collect()
def test_hdfs_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH, int(random.random() * 999999))
    print('HDFS test file: {0}'.format(fn))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert rdd.count() == read_rdd.count() and all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
def test_read_tar_gz():
    # file was created with:
    # tar -cvzf data.tar.gz hello.txt
    rdd = Context().textFile('{}/data.tar.gz'.format(LOCAL_TEST_PATH))
    print(rdd.collect())
    assert 'Hello fast_pyspark_tester!' in rdd.collect()
from __future__ import print_function

from fast_pyspark_tester import Context

# read the paths of all WARC and WAT files from the latest Common Crawl
paths_rdd = Context().textFile(
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    'warc.paths.*,'
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/'
    'wat.paths.gz',
)

print(paths_rdd.collect())
from __future__ import print_function

from fast_pyspark_tester import Context

my_rdd = Context().textFile('tests/*.py')
print('In tests/*.py: all lines={0}, with import={1}'.format(
    my_rdd.count(),
    my_rdd.filter(lambda l: l.startswith('import ')).count(),
))