Example 1
def test_saveAsTextFile_zip():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name + '.zip')
    read_rdd = Context().textFile(tempFile.name + '.zip')
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()
Example 2
def test_local_regex_read():
    # was not working before 0.3.19
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(30), 30).saveAsTextFile(tempFile.name)
    d = Context().textFile(tempFile.name + '/part-0000*').collect()
    print(d)
    assert len(d) == 10
Example 3
    def create_context(n_processes=0):
        if not n_processes:
            return Context()

        p = futures.ProcessPoolExecutor(n_processes)
        return Context(
            pool=p,
            serializer=cloudpickle.dumps,
            # serializer=pickle.dumps,
            deserializer=pickle.loads,
        )
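A minimal usage sketch for the helper above, assuming the same imports as the snippet (concurrent.futures as futures, cloudpickle, pickle) and pysparkling's Context; the call below is hypothetical and not part of the original test suite:

# Hypothetical usage of create_context(); assumes the imports listed above.
c = create_context(n_processes=2)  # Context backed by a ProcessPoolExecutor
squares = c.parallelize(range(5), 2).map(lambda x: x * x).collect()
print(squares)  # expected: [0, 1, 4, 9, 16]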
Example 4
def test_s3_textFile_loop():
    random.seed()

    fn = f'{S3_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'

    rdd = Context().parallelize(f'Line {n}' for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example 5
def test_gs_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(GS_TEST_PATH,
                                               int(random.random() * 999999.0))

    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example 6
def test_hdfs_textFile_loop():
    random.seed()

    fn = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'
    print(f'HDFS test file: {fn}')

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (rdd.count() == read_rdd.count()
            and all(r1 == r2
                    for r1, r2 in zip(rdd.collect(), read_rdd.collect())))
Example 7
def test_hdfs_textFile_loop():
    random.seed()

    fn = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH,
                                               int(random.random() * 999999.0))
    print('HDFS test file: {0}'.format(fn))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (rdd.count() == read_rdd.count()
            and all(r1 == r2
                    for r1, r2 in zip(rdd.collect(), read_rdd.collect())))
Example 8
def test_read_7z():
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    rdd = Context().textFile('tests/data.7z')
    print(rdd.collect())
    assert 'from pysparkling import Context' in rdd.collect()
Example 9
    def test_algorithm_execution(self):
        """Tests the algorithm execution with basic parameters
        """
        folder_path = os.path.dirname(os.path.realpath(__file__))
        json_path = os.path.join(folder_path, 'data', 'objects.json')

        dataset = Context(
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile(json_path).map(json.loads)

        dataset.persist()

        number_of_cluster = 2

        algorithm_settings = settings.ClusteringSetting(
            number_of_cluster, 2, 0.01, AlgorithmProvider.random_distance)

        labeled, centroids = kmeans.compute_cluster(
            dataset, algorithm_settings, AlgorithmProvider.random_sampling,
            AlgorithmProvider.random_cluster, AlgorithmProvider.dummy_update)

        labeled.collect()
        self.assertEqual(10, labeled.count())

        first_item = labeled.first()
        self.assertTrue(0 <= first_item[0] < 2)
        self.assertEqual(number_of_cluster, len(centroids))
Example 10
    def test_session_storage_level(self):
        spark = SparkSession(Context())
        df = spark.range(4, numPartitions=2)
        self.assertEqual(repr(df.storageLevel), repr(StorageLevel(False, False, False, False, 1)))
        persisted_df = df.persist()
        self.assertEqual(persisted_df.is_cached, True)
        self.assertEqual(repr(persisted_df.storageLevel), repr(StorageLevel.MEMORY_ONLY))
Example 11
def test_first_mp():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=cloudpickle.dumps,
                deserializer=pickle.loads)
    my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
    print(my_rdd.first())
    assert my_rdd.first() == 1
Example 12
def test_s3_textFile_loop():
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = '{0}/pysparkling_test_{1}.txt'.format(S3_TEST_PATH,
                                               int(random.random() * 999999.0))

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example 13
def test_gs_textFile_loop():
    if not OAUTH2_CLIENT_ID or not GS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = GS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
Example 14
def test_s3_textFile():
    myrdd = Context().textFile(
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*')
    assert (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz' in myrdd.collect())
Example 15
def test_hdfs_textFile_loop():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0)
    )

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    assert (
        rdd.count() == read_rdd.count() and
        all(r1 == r2 for r1, r2 in zip(rdd.collect(), read_rdd.collect()))
    )
Example 16
def test_multiprocessing():
    p = multiprocessing.Pool(4)
    c = Context(pool=p, serializer=cloudpickle.dumps,
                deserializer=pickle.loads)
    my_rdd = c.parallelize([1, 3, 4])
    r = my_rdd.map(lambda x: x*x).collect()
    print(r)
    assert 16 in r
Example 17
def test_saveAsTextFile():
    tempFile = tempfile.NamedTemporaryFile(delete=True)
    tempFile.close()
    Context().parallelize(range(10)).saveAsTextFile(tempFile.name)
    with open(tempFile.name, 'r') as f:
        r = f.readlines()
        print(r)
        assert '5\n' in r
Example 18
def test_hdfs_file_exists():
    random.seed()

    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{int(random.random() * 999999.0):d}.txt'

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example 19
def test_cache():
    # this crashes in version 0.2.28
    lines = Context().textFile('tests/*textFil*.py')
    lines = lines.map(lambda l: '-' + l).cache()
    print(len(lines.collect()))
    lines = lines.map(lambda l: '+' + l)
    lines = lines.map(lambda l: '-' + l).cache()
    lines = lines.collect()
    print(lines)
    assert '-+-from pysparkling import Context' in lines
Example 20
def test_hdfs_file_exists():
    random.seed()

    fn1 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH,
                                                int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH,
                                                int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example 21
def test_lazy_execution_threadpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ThreadPoolExecutor(4) as p:
        r = Context(pool=p).textFile('tests/test_multiprocessing.py')
        r = r.map(indent_line).cache()
        r.collect()
        r = r.map(indent_line)
        r = r.collect()
        # ThreadPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example 22
def run_feature_extraction():
    start_time = time.time()
    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'
    # default_path = '/media/chris/cschulze_external_4tb/elliot_data/train_nonpill'
    # default_path = '/train_nonpill'

    parser.add_argument("--input_dir",
                        help="input directory",
                        default=default_path)
    parser.add_argument("--output",
                        help="output file",
                        default='image_features')
    args = parser.parse_args()
    # serialize and put all images in rdd:
    # use json schema:
    #     "image_name": "",
    #     "bytes": ""
    #     "features": "array[]"
    image_dir_path = args.input_dir
    df, data_arr = serialize_and_make_df(image_dir_path)
    print(df.head())
    print(df.info())

    # df to df_cvs:
    csv_df_file = 'dataframe_csv_file.csv'
    json_df_file = 'dataframe_csv_file.json'
    df.to_csv(csv_df_file, header=False, index=False)
    # df.to_json(json_df_file)

    # rdd from df_csv
    # pysparkling:
    sc = Context()

    # pyspark:
    # conf = SparkConf().setAppName("HOG and GIST ETL")
    # sc = SparkContext(conf=conf)

    # rdd = sc.textFile(json_df_file)
    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)
    # submit image rdd to processing
    rdd_features = rdd.map(get_features).coalesce(1)
    # save as txt file:
    rdd_features.map(dump).saveAsTextFile(args.output)
    print("------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0))
Example 23
def test_processpool_distributed_cache():
    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).parallelize(range(3), 3)
        r = r.map(lambda _: time.sleep(0.1)).cache()
        r.collect()

        time_start = time.time()
        print(r.collect())
        time_end = time.time()
        assert time_end - time_start < 0.3
Example 24
def test_hdfs_file_exists():
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    fn1 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))
    fn2 = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(
        int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()
Example 25
def test_lazy_execution_processpool():
    def indent_line(l):
        return '--- ' + l

    with futures.ProcessPoolExecutor(4) as p:
        r = Context(
            pool=p,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        ).textFile('tests/test_multiprocessing.py')  # .take(10)
        print(r.collect())
        r = r.map(indent_line)
        print(r.collect())
        r = r.cache()
        print(r.collect())
        r = r.map(indent_line)
        r = r.collect()
        # ProcessPool is not lazy although it returns generators.
        print(r)
        assert '--- --- from pysparkling import Context' in r
Example 26
def test_lazy_execution():
    class I(object):
        def __init__(self):
            self.executed = False

        def indent_line(self, l):
            # global indent_was_executed
            self.executed = True
            return '--- ' + l

    r = Context().textFile('tests/test_multiprocessing.py')
    i = I()

    r = r.map(i.indent_line)
    exec_before_collect = i.executed
    # at this point, no map() or foreach() should have been executed
    r = r.map(i.indent_line).cache()
    print(r.collect())
    r = r.map(i.indent_line)
    r.collect()
    exec_after_collect = i.executed
    print((exec_before_collect, exec_after_collect))
    assert not exec_before_collect and exec_after_collect
Example 27
def test_wholeTextFiles():
    t = Context().wholeTextFiles('tests/*.py').lookup('tests/test_textFile.py')
    print(t)
    assert 'test_wholeTextFiles' in t[0]
Example 28
def test_local_textFile_name():
    name = Context().textFile('tests/*.py').name()
    print(name)
    assert name == 'tests/*.py'
Example 29
def test_local_textFile_2():
    line_count = Context().textFile('tests/*.py').count()
    print(line_count)
    assert line_count > 90
Example 30
def test_local_textFile_1():
    lines = Context().textFile('tests/*textFil*.py').collect()
    print(lines)
    assert 'from pysparkling import Context' in lines