def test_first_mp():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=cloudpickle.dumps,
                deserializer=pickle.loads)
    my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
    print(my_rdd.first())
    assert my_rdd.first() == 1
Example #2
def test_saveAsTextFile_zip():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name + '.zip')
    read_rdd = Context().textFile(tempFile.name + '.zip')
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()
Example #3
def test_read_7z():
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    rdd = Context().textFile('tests/data.7z')
    print(rdd.collect())
    assert 'from pysparkling import Context' in rdd.collect()
def test_multiprocessing():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=dill.dumps, deserializer=dill.loads)
    my_rdd = c.parallelize([1, 3, 4])
    r = my_rdd.map(lambda x: x*x).collect()
    print(r)
    assert 16 in r
Example #5
def test_read_7z():
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    rdd = Context().textFile('tests/data.7z')
    print(rdd.collect())
    assert 'from pysparkling import Context' in rdd.collect()
Example #6
def test_saveAsTextFile_zip():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name+'.zip')
    read_rdd = Context().textFile(tempFile.name+'.zip')
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()
Example #7
    def test_algorithm_execution(self):
        """Tests the algorithm execution with basic parameters
        """
        folder_path = os.path.dirname(os.path.realpath(__file__))
        json_path = os.path.join(folder_path, 'data', 'objects.json')

        dataset = Context(
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile(json_path).map(json.loads)

        dataset.persist()

        number_of_cluster = 2

        algorithm_settings = settings.ClusteringSetting(
            number_of_cluster, 2, 0.01, AlgorithmProvider.random_distance)

        labeled, centroids = kmeans.compute_cluster(
            dataset, algorithm_settings, AlgorithmProvider.random_sampling,
            AlgorithmProvider.random_cluster, AlgorithmProvider.dummy_update)

        labeled.collect()
        self.assertEqual(10, labeled.count())

        first_item = labeled.first()
        self.assertTrue(0 <= first_item[0] < 2)
        self.assertEqual(number_of_cluster, len(centroids))
Example #8
def test_local_regex_read():
    # was not working before 0.3.19
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(30), 30).saveAsTextFile(tempFile.name)
    d = Context().textFile(tempFile.name + '/part-0000*').collect()
    print(d)
    assert len(d) == 10
def test_lazy_execution():
    r = Context().textFile('tests/test_multiprocessing.py')
    r = r.map(indent_line)
    exec_before_collect = INDENT_WAS_EXECUTED
    # at this point, no map() or foreach() should have been executed
    r.collect()
    exec_after_collect = INDENT_WAS_EXECUTED
    assert not exec_before_collect and exec_after_collect
Example #10
def test_s3_textFile():
    myrdd = Context().textFile(
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*')
    assert (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz' in myrdd.collect())
def test_filter():
    my_rdd = Context().parallelize(
        [1, 2, 2, 4, 1, 3, 5, 9],
        3,
    ).filter(lambda x: x % 2 == 0)
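    # the even values 2, 2 and 4 remain, so count() is 3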
    print(my_rdd.collect())
    print(my_rdd.count())
    assert my_rdd.count() == 3
def test_multiprocessing():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=cloudpickle.dumps,
                deserializer=pickle.loads)
    my_rdd = c.parallelize([1, 3, 4])
    r = my_rdd.map(lambda x: x*x).collect()
    print(r)
    assert 16 in r
def test_mapPartitions():
    rdd = Context().parallelize([1, 2, 3, 4], 2)

    def f(iterator):
        yield sum(iterator)

    r = rdd.mapPartitions(f).collect()
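    # with 2 partitions, f sums [1, 2] -> 3 and [3, 4] -> 7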
    assert 3 in r and 7 in r
def test_lazy_execution_threadpool():
    with futures.ThreadPoolExecutor(4) as p:
        r = Context(pool=p).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line)
        r = r.map(indent_line)
        r = r.collect()
        # ThreadPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example #15
def test_hdfs_file_exists():
    random.seed()

    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example #16
def test_s3_textFile():
    myrdd = Context().textFile(
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*'
    )
    assert (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz' in myrdd.collect()
    )
Example #17
    def create_context(n_processes=0):
        if not n_processes:
            return Context()

        p = futures.ProcessPoolExecutor(n_processes)
        return Context(
            pool=p,
            serializer=cloudpickle.dumps,
            # serializer=pickle.dumps,
            deserializer=pickle.loads,
        )
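A minimal usage sketch for a helper like create_context, assuming it is exposed as a
module-level function; the process count and the parallelized values below are
illustrative, not taken from the source:

# hypothetical usage of the create_context() helper above
ctx = create_context(n_processes=2)  # n_processes=0 falls back to a plain Context()
squares = ctx.parallelize([1, 2, 3]).map(lambda x: x * x).collect()
print(squares)  # [1, 4, 9]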
Example #18
def test_hdfs_file_exists():
    random.seed()

    fn1 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example #19
def test_s3_textFile_loop():
    random.seed()

    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0)}.txt'

    rdd = Context().parallelize(f'Line {n}' for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example #20
def test_hdfs_file_exists():
    random.seed()

    fn1 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example #21
def test_gs_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example #22
def run_feature_extraction():
    start_time = time.time()
    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'
    # default_path = '/media/chris/cschulze_external_4tb/elliot_data/train_nonpill'
    # default_path = '/train_nonpill'

    parser.add_argument("--input_dir",
                        help="input directory",
                        default=default_path)
    parser.add_argument("--output",
                        help="output file",
                        default='image_features')
    args = parser.parse_args()
    # serialize and put all images in rdd:
    # use json schema:
    #     "image_name": "",
    #     "bytes": ""
    #     "features": "array[]"
    image_dir_path = args.input_dir
    df, data_arr = serialize_and_make_df(image_dir_path)
    print(df.head())
    print(df.info())

    # df to df_cvs:
    csv_df_file = 'dataframe_csv_file.csv'
    json_df_file = 'dataframe_csv_file.json'
    df.to_csv(csv_df_file, header=False, index=False)
    # df.to_json(json_df_file)

    # rdd from df_csv
    # pysparkling:
    sc = Context()

    # pyspark:
    # conf = SparkConf().setAppName("HOG and GIST ETL")
    # sc = SparkContext(conf=conf)

    # rdd = sc.textFile(json_df_file)
    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)
    # submit image rdd to processing
    rdd_features = rdd.map(get_features).coalesce(1)
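    # coalesce(1) collapses the mapped feature results into a single partition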
    # save as txt file:
    rdd_features.map(dump).saveAsTextFile(args.output)
    print "------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0)
def test_processpool_distributed_cache():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).parallelize(range(3), 3)
        r = r.map(lambda _: time.sleep(0.1)).cache()
        r.collect()
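        # the collect() above fills the cache; the timed collect() below should reuse it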

        time_start = time.time()
        print(r.collect())
        time_end = time.time()
        assert time_end - time_start < 0.3
Example #24
def test_hdfs_file_exists():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn1 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    fn2 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example #25
    def test_session_storage_level(self):
        spark = SparkSession(Context())
        df = spark.range(4, numPartitions=2)
        self.assertEqual(repr(df.storageLevel), repr(StorageLevel(False, False, False, False, 1)))
        persisted_df = df.persist()
        self.assertEqual(persisted_df.is_cached, True)
        self.assertEqual(repr(persisted_df.storageLevel), repr(StorageLevel.MEMORY_ONLY))
Example #26
def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = '{0}/pysparkling_test_{1}.txt'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example #27
def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = GS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example #28
def test_hdfs_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    print('HDFS test file: {0}'.format(fn))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (
        rdd.count() == read_rdd.count() and
        all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )
Example #29
def test_hdfs_file_exists():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn1 = HDFS_TEST_PATH+'/pysparkling_test_{0}.txt'.format(
        int(random.random()*999999.0)
    )
    fn2 = HDFS_TEST_PATH+'/pysparkling_test_{0}.txt'.format(
        int(random.random()*999999.0)
    )

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example #30
def test_hdfs_textFile_loop():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = HDFS_TEST_PATH+'/pysparkling_test_{0}.txt'.format(
        int(random.random()*999999.0)
    )

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    assert (
        rdd.count() == read_rdd.count() and
        all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )
Example #31
def test_saveAsTextFile():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name)
    with open(tempFile.name, 'r') as f:
        r = f.readlines()
        print(r)
        assert '5\n' in r
Example #32
def test_cache():
    # this crashes in version 0.2.28
    lines = Context().textFile('tests/*textFil*.py')
    lines = lines.map(lambda l: '-' + l).cache()
    print(len(lines.collect()))
    lines = lines.map(lambda l: '+' + l)
    lines = lines.map(lambda l: '-' + l).cache()
    lines = lines.collect()
    print(lines)
    assert '-+-from pysparkling import Context' in lines
Example #33
def test_gs_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (
        rdd.count() == rdd_check.count() and
        all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )
def test_cache():
    my_rdd = Context().parallelize([1, 2, 3, 4], 2)
    my_rdd = my_rdd.map(lambda x: x*x).cache()
    print('no exec until here')
    print(my_rdd.first())
    print('executed map on first partition only')
    print(my_rdd.collect())
    print('now map() was executed on all partitions and should '
          'not be executed again')
    print(my_rdd.collect())
    assert len(my_rdd.collect()) == 4 and 16 in my_rdd.collect()
Example #35
def test_lazy_execution_threadpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ThreadPoolExecutor(4) as p:
        r = Context(pool=p).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ThreadPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example #36
def test_lazy_execution_processpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile('tests/test_multiprocessing.py')  # .take(10)
        print(r.collect())
        r = r.map(indent_line)
        print(r.collect())
        r = r.cache()
        print(r.collect())
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example #37
def test_cache():
    # this crashes in version 0.2.28
    lines = Context().textFile('tests/*textFil*.py')
    lines = lines.map(lambda l: '-'+l).cache()
    print(len(lines.collect()))
    lines = lines.map(lambda l: '+'+l)
    lines = lines.map(lambda l: '-'+l).cache()
    lines = lines.collect()
    print(lines)
    assert '-+-from pysparkling import Context' in lines
Example #38
def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = '{0}/pysparkling_test_{1}.txt'.format(
        GS_TEST_PATH, int(random.random() * 999999.0)
    )

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (
        rdd.count() == rdd_check.count() and
        all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )
Example #39
def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = S3_TEST_PATH+'/pysparkling_test_{0}.txt'.format(
        int(random.random()*999999.0)
    )

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (
        rdd.count() == rdd_check.count() and
        all(e1 == e2 for e1, e2 in zip(rdd.collect(), rdd_check.collect()))
    )
def test_lazy_execution_processpool():
    def indent_line(l):
        return '--- '+l

    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile('tests/test_multiprocessing.py')  # .take(10)
        print(r.collect())
        r = r.map(indent_line)
        print(r.collect())
        r = r.cache()
        print(r.collect())
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example #41
def test_processpool_distributed_cache():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).parallelize(range(3), 3)
        r = r.map(lambda _: time.sleep(0.1)).cache()
        r.collect()

        time_start = time.time()
        print(r.collect())
        time_end = time.time()
        assert time_end - time_start < 0.3
def test_lazy_execution_processpool():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=dill.dumps,
            deserializer=dill.loads,
        ).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example #43
def test_lazy_execution():
    class I(object):
        def __init__(self):
            self.executed = False

        def indent_line(self, l):
            # global indent_was_executed
            self.executed = True
            return '--- ' + l

    r = Context().textFile('tests/test_multiprocessing.py')
    i = I()

    r = r.map(i.indent_line)
    exec_before_collect = i.executed
    # at this point, no map() or foreach() should have been executed
    r = r.map(i.indent_line).cache()
    print(r.collect())
    r = r.map(i.indent_line)
    r.collect()
    exec_after_collect = i.executed
    print((exec_before_collect, exec_after_collect))
    assert not exec_before_collect and exec_after_collect
Example #45
def test_local_textFile_name():
    name = Context().textFile('tests/*.py').name()
    print(name)
    assert name == 'tests/*.py'
Example #46
def test_local_textFile_1():
    lines = Context().textFile('tests/*textFil*.py').collect()
    print(lines)
    assert 'from pysparkling import Context' in lines
Example #47
def test_local_textFile_2():
    line_count = Context().textFile('tests/*.py').count()
    print(line_count)
    assert line_count > 90
Example #48
def test_pyspark_compatibility_gz():
    kv = Context().textFile('tests/pyspark/key_value.txt.gz').collect()
    print(kv)
    assert u"a\t1" in kv and u"b\t2" in kv and len(kv) == 2
Example #49
from pysparkling import Context

my_rdd = Context().textFile("tests/*.py")
print(
    "In tests/*.py: all lines={0}, with import={1}".format(
        my_rdd.count(), my_rdd.filter(lambda l: l.startswith("import ")).count()
    )
)
from pysparkling import Context

counts = Context().textFile(
    'README.rst'
).flatMap(
    lambda line: line.split(' ')
).map(
    lambda word: (word, 1)
).reduceByKey(
    lambda a, b: a + b
)
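# counts is an RDD of (word, count) pairs: flatMap splits lines into words, reduceByKey sums the 1s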
print(counts.collect())
Example #51
def test_pyspark_compatibility_txt():
    kv = Context().textFile('tests/pyspark/key_value.txt').collect()
    print(kv)
    assert u"('a', 1)" in kv and u"('b', 2)" in kv and len(kv) == 2
Example #52
def test_http_textFile():
    myrdd = Context().textFile(
        'https://s3-us-west-2.amazonaws.com/human-microbiome-project/DEMO/'
        'HM16STR/46333/by_subject/1139.fsa'
    )
    assert u'TGCTGCGGTGAATGCGTTCCCGGGTCT' in myrdd.collect()
Example #53
class RDDTest(unittest.TestCase):
    """Tests for the resilient distributed databases"""

    def setUp(self):
        self.context = Context()

    def testLeftOuterJoinSimple(self):
        """Test the basic left outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())
        zx = sorted(z.leftOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc'))])

        self.assertEqual(zx, [('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    def testLeftOuterJoinDuplicate(self):
        """Test the left outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.leftOuterJoin(y).collect())
        xz = sorted(x.leftOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2'))])

    def testRightOuterJoinSimple(self):
        """Test the basic right outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())
        zx = sorted(z.rightOuterJoin(x).collect())

        self.assertEqual(xy, [('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc'))])

    def testRightOuterJoinDuplicate(self):
        """Test the right outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.rightOuterJoin(y).collect())
        xz = sorted(x.rightOuterJoin(z).collect())

        self.assertEqual(xy, [('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def testFullOuterJoinSimple(self):
        """Test the basic full outer join with simple key-value pairs"""
        x = self.context.parallelize([('a', 'xa'), ('b', 'xb'), ('c', 'xc')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())
        zx = sorted(z.fullOuterJoin(x).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', ('xb', 'yb')),
                              ('c', ('xc', 'yc'))])

        self.assertEqual(xz, [('a', ('xa', None)),
                              ('b', ('xb', None)),
                              ('c', ('xc', 'zc')),
                              ('d', (None, 'zd'))])

        self.assertEqual(zx, [('a', (None, 'xa')),
                              ('b', (None, 'xb')),
                              ('c', ('zc', 'xc')),
                              ('d', ('zd', None))])

    def testFullOuterJoinDuplicate(self):
        """Test the full outer join with duplicate keys"""
        x = self.context.parallelize([('a', 'xa'), ('c', 'xc1'), ('c', 'xc2')])
        y = self.context.parallelize([('b', 'yb'), ('c', 'yc')])
        z = self.context.parallelize([('c', 'zc1'), ('c', 'zc2'), ('d', 'zd')])

        xy = sorted(x.fullOuterJoin(y).collect())
        xz = sorted(x.fullOuterJoin(z).collect())

        self.assertEqual(xy, [('a', ('xa', None)),
                              ('b', (None, 'yb')),
                              ('c', ('xc1', 'yc')),
                              ('c', ('xc2', 'yc'))])

        # Two sets of duplicate keys gives cartesian product
        self.assertEqual(xz, [('a', ('xa', None)),
                              ('c', ('xc1', 'zc1')),
                              ('c', ('xc1', 'zc2')),
                              ('c', ('xc2', 'zc1')),
                              ('c', ('xc2', 'zc2')),
                              ('d', (None, 'zd'))])

    def test_cartesian(self):
        x = self.context.parallelize(range(0, 2))
        y = self.context.parallelize(range(3, 6))
        c = x.cartesian(y)
        result = sorted(c.collect())
        expected = sorted([(0, 3), (0, 4), (0, 5), (1, 3), (1, 4), (1, 5)])
        self.assertListEqual(result, expected)

    def test_sample(self):
        rdd = self.context.parallelize(range(100), 4)
        self.assertTrue(6 <= rdd.sample(False, 0.1, 81).count() <= 14)

    def test_sampleByKey(self):
        fractions = {"a": 0.2, "b": 0.1}
        range_rdd = self.context.parallelize(range(0, 1000))
        rdd = self.context.parallelize(fractions.keys()).cartesian(range_rdd)
        sample = dict(
            rdd.sampleByKey(False, fractions, 2).groupByKey().collect()
        )
        self.assertTrue(100 < len(sample["a"]) < 300 and
                        50 < len(sample["b"]) < 150)
        self.assertTrue(max(sample["a"]) <= 999 and min(sample["a"]) >= 0)
        self.assertTrue(max(sample["b"]) <= 999 and min(sample["b"]) >= 0)

    def test_groupByKey(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValue(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValue(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.groupByKey().collect()

        expected_group = ((0, r[0::3]),
                          (1, r[1::3]),
                          (2, r[2::3]))

        grouped_dict = {k: v for k, v in actual_group}

        for k, v in expected_group:
            self.assertIn(k, grouped_dict)

            for vv in v:
                self.assertIn(vv, grouped_dict[k])

    def test_reduceByKey(self):
        # This will fail if the values of the RDD need to be compared
        class IncomparableValueAddable(object):
            def __init__(self, value):
                self.value = value

            def __eq__(self, other):
                return self.value == other.value

            def __add__(self, other):
                return self.__class__(self.value + other.value)

            def __lt__(self, other):
                raise NotImplementedError("This object cannot be compared")

        keys = (0, 1, 2, 0, 1, 2)
        r = [IncomparableValueAddable(i) for i in range(len(keys))]

        k_rdd = self.context.parallelize(zip(keys, r))
        actual_group = k_rdd.reduceByKey(add).collect()

        expected_group = ((0, IncomparableValueAddable(3)),
                          (1, IncomparableValueAddable(5)),
                          (2, IncomparableValueAddable(7)))

        grouped_dict = {k: v for k, v in actual_group}

        # Keep this order-agnostic
        for k, v in expected_group:
            self.assertEqual(grouped_dict[k], v)
Example #54
def test_wholeTextFiles():
    t = Context().wholeTextFiles('tests/*.py').lookup('tests/test_textFile.py')
    print(t)
    assert 'test_wholeTextFiles' in t[0]
from pysparkling import Context

# read all the paths of warc and wat files of the latest Common Crawl
paths_rdd = Context().textFile(
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/warc.paths.*,'
    's3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2015-11/wat.paths.gz'
)

print(paths_rdd.collect())
Example #56
def test_wholeTextFiles():
    all_files = Context().wholeTextFiles('{}/*.py'.format(LOCAL_TEST_PATH))
    this_file = all_files.lookup(__file__)
    print(this_file)
    assert 'test_wholeTextFiles' in this_file[0]
Example #57
def test_local_textFile_name():
    name = Context().textFile('{}/*.py'.format(LOCAL_TEST_PATH)).name()
    print(name)
    assert name.startswith('{}/*.py'.format(LOCAL_TEST_PATH))
Example #58
    def setUp(self):
        self.context = Context()