def test_saveAsTextFile_zip():
    """Write ten integers to a '.zip' path and read them back as text."""
    # Opening and closing the temporary file only reserves a unique name.
    with tempfile.NamedTemporaryFile(delete=True) as tmp:
        zip_path = tmp.name + '.zip'

    numbers = Context().parallelize(range(10))
    numbers.saveAsTextFile(zip_path)

    read_rdd = Context().textFile(zip_path)
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()
def test_local_regex_read():
    """Read back only the partitions matching the 'part-0000*' pattern."""
    # was not working before 0.3.19
    with tempfile.NamedTemporaryFile(delete=True) as tmp:
        out_dir = tmp.name

    # 30 elements in 30 partitions; the assertion expects the pattern
    # to select exactly ten of the written lines.
    Context().parallelize(range(30), 30).saveAsTextFile(out_dir)

    pattern = out_dir + '/part-0000*'
    matched = Context().textFile(pattern).collect()
    print(matched)
    assert len(matched) == 10
def create_context(n_processes=0):
    """Build a Context, optionally backed by a process pool.

    With a falsy ``n_processes`` a default ``Context()`` is returned.
    Otherwise the Context is given a ``ProcessPoolExecutor`` created with
    ``n_processes`` and uses cloudpickle for serialization and pickle for
    deserialization.
    """
    if n_processes:
        return Context(
            pool=futures.ProcessPoolExecutor(n_processes),
            serializer=cloudpickle.dumps,
            # serializer=pickle.dumps,
            deserializer=pickle.loads,
        )
    return Context()
def test_s3_textFile_loop():
    """Round-trip 200 lines through a randomly named file under S3_TEST_PATH.

    The random suffix is converted with ``int()`` before formatting: the
    ``d`` presentation type only accepts integers, so formatting the raw
    float raised ``ValueError`` before any I/O took place.
    """
    random.seed()
    suffix = int(random.random() * 999999.0)
    fn = f'{S3_TEST_PATH}/pysparkling_test_{suffix:d}.txt'

    rdd = Context().parallelize(f'Line {n}' for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count() and
            all(e1 == e2
                for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def test_gs_textFile_loop():
    """Round-trip 200 lines through a randomly named file under GS_TEST_PATH.

    The random suffix is converted with ``int()`` before formatting: the
    ``{:d}`` field only accepts integers, so passing the raw float raised
    ``ValueError`` before any I/O took place.
    """
    random.seed()
    suffix = int(random.random() * 999999.0)
    fn = '{}/pysparkling_test_{:d}.txt'.format(GS_TEST_PATH, suffix)

    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count() and
            all(e1 == e2
                for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def test_hdfs_textFile_loop():
    """Round-trip ten lines through a randomly named file under HDFS_TEST_PATH.

    The random suffix is converted with ``int()`` before formatting: the
    ``d`` presentation type only accepts integers, so formatting the raw
    float raised ``ValueError`` before any I/O took place.
    """
    random.seed()
    suffix = int(random.random() * 999999.0)
    fn = f'{HDFS_TEST_PATH}/pysparkling_test_{suffix:d}.txt'
    print(f'HDFS test file: {fn}')

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())

    assert (rdd.count() == read_rdd.count() and
            all(r1 == r2
                for r1, r2 in zip(rdd.collect(), read_rdd.collect())))
def test_hdfs_textFile_loop():
    """Round-trip ten lines through a randomly named file under HDFS_TEST_PATH.

    The random suffix is converted with ``int()`` before formatting: the
    ``{:d}`` field only accepts integers, so passing the raw float raised
    ``ValueError`` before any I/O took place.
    """
    random.seed()
    suffix = int(random.random() * 999999.0)
    fn = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH, suffix)
    print('HDFS test file: {0}'.format(fn))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())

    assert (rdd.count() == read_rdd.count() and
            all(r1 == r2
                for r1, r2 in zip(rdd.collect(), read_rdd.collect())))
def test_read_7z():
    """Read a 7z archive through textFile and look for a known line."""
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    archive = Context().textFile('tests/data.7z')
    print(archive.collect())
    expected_line = 'from pysparkling import Context'
    assert expected_line in archive.collect()
def test_algorith_excution(self):
    """Run the clustering algorithm once with basic parameters.

    Loads 'data/objects.json' next to this file, clusters it into two
    groups and checks the labelled output and the centroid count.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    json_path = os.path.join(here, 'data', 'objects.json')

    context = Context(
        serializer=cloudpickle.dumps,
        deserializer=pickle.loads,
    )
    dataset = context.textFile(json_path).map(json.loads)
    dataset.persist()

    number_of_cluster = 2
    algorithm_settings = settings.ClusteringSetting(
        number_of_cluster, 2, 0.01, AlgorithmProvider.random_distance)

    labeled, centroids = kmeans.compute_cluster(
        dataset,
        algorithm_settings,
        AlgorithmProvider.random_sampling,
        AlgorithmProvider.random_cluster,
        AlgorithmProvider.dummy_update)

    labeled.collect()
    self.assertEqual(10, labeled.count())

    head = labeled.first()
    self.assertTrue(0 <= head[0] < 2)
    self.assertEqual(number_of_cluster, len(centroids))
def test_session_storage_level(self):
    """Compare a DataFrame's storage level before and after persist()."""
    session = SparkSession(Context())
    frame = session.range(4, numPartitions=2)

    # Before persisting, the level is compared by repr to an all-False,
    # single-replica StorageLevel.
    not_stored = StorageLevel(False, False, False, False, 1)
    self.assertEqual(repr(frame.storageLevel), repr(not_stored))

    persisted_df = frame.persist()
    self.assertEqual(persisted_df.is_cached, True)
    self.assertEqual(
        repr(persisted_df.storageLevel),
        repr(StorageLevel.MEMORY_ONLY),
    )
def test_first_mp():
    """Check that first() works on an RDD backed by a multiprocessing pool.

    The pool is closed and joined in a ``finally`` clause so its four
    worker processes are released even when the assertion fails.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
        print(my_rdd.first())
        assert my_rdd.first() == 1
    finally:
        # close() stops accepting work; join() waits for workers to exit.
        p.close()
        p.join()
def test_s3_textFile_loop():
    """Round-trip 200 lines through a randomly named file under S3_TEST_PATH.

    Skipped when AWS_ACCESS_KEY_ID or S3_TEST_PATH is not set.

    The file name uses automatic field numbering for both fields. The
    previous template mixed ``{}`` with ``{0}``, which ``str.format``
    rejects with ``ValueError``, and ``{0}`` would have referred to the
    path rather than the random number.
    """
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()
    fn = '{}/pysparkling_test_{}.txt'.format(
        S3_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count() and
            all(e1 == e2
                for e1, e2 in zip(rdd.collect(), rdd_check.collect())))
def test_gs_textFile_loop():
    """Round-trip 200 lines through a randomly named file under GS_TEST_PATH.

    Skipped when OAUTH2_CLIENT_ID or GS_TEST_PATH is not set.
    """
    if not (OAUTH2_CLIENT_ID and GS_TEST_PATH):
        raise SkipTest

    random.seed()
    suffix = int(random.random() * 999999.0)
    fn = GS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix)

    written = Context().parallelize(
        "Line {0}".format(n) for n in range(200))
    written.saveAsTextFile(fn)
    read_back = Context().textFile(fn)

    # Same number of lines, then pairwise-equal content.
    assert written.count() == read_back.count()
    assert all(
        left == right
        for left, right in zip(written.collect(), read_back.collect())
    )
def test_s3_textFile():
    """Read the public Common Crawl path listing and look for one entry."""
    source = (
        's3n://aws-publicdatasets/common-crawl/crawl-data/'
        'CC-MAIN-2015-11/warc.paths.*'
    )
    expected_entry = (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz'
    )
    myrdd = Context().textFile(source)
    assert expected_entry in myrdd.collect()
def test_hdfs_textFile_loop():
    """Round-trip ten lines through a randomly named file under HDFS_TEST_PATH.

    Skipped when HDFS_TEST_PATH is not set.
    """
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()
    suffix = int(random.random() * 999999.0)
    fn = HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix)

    written = Context().parallelize(
        'Hello World {0}'.format(x) for x in range(10))
    written.saveAsTextFile(fn)
    read_back = Context().textFile(fn)

    # Same number of lines, then pairwise-equal content.
    assert written.count() == read_back.count()
    assert all(
        left == right
        for left, right in zip(written.collect(), read_back.collect())
    )
def test_multiprocessing():
    """Check that map() runs on an RDD backed by a multiprocessing pool.

    The pool is closed and joined in a ``finally`` clause so its four
    worker processes are released even when the assertion fails.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 3, 4])
        r = my_rdd.map(lambda x: x * x).collect()
        print(r)
        assert 16 in r
    finally:
        # close() stops accepting work; join() waits for workers to exit.
        p.close()
        p.join()
def test_saveAsTextFile():
    """Write ten integers as text and check the raw file content."""
    # Opening and closing the temporary file only reserves a unique name.
    with tempfile.NamedTemporaryFile(delete=True) as tmp:
        out_path = tmp.name

    Context().parallelize(range(10)).saveAsTextFile(out_path)

    with open(out_path, 'r') as handle:
        written_lines = handle.readlines()
    print(written_lines)
    assert '5\n' in written_lines
def test_hdfs_file_exists():
    """Check File.exists() for a written and an unwritten HDFS path.

    Both random suffixes are converted with ``int()`` before formatting:
    the ``d`` presentation type only accepts integers, so formatting the
    raw float raised ``ValueError`` before any I/O took place.
    """
    random.seed()
    suffix1 = int(random.random() * 999999.0)
    suffix2 = int(random.random() * 999999.0)
    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{suffix1:d}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{suffix2:d}.txt'

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)

    # Only fn1 was written; fn2 is expected not to exist.
    assert File(fn1).exists() and not File(fn2).exists()
def test_cache():
    """Chain map() and cache() calls and check the accumulated prefixes."""
    # this crashes in version 0.2.28
    source = Context().textFile('tests/*textFil*.py')

    first_cached = source.map(lambda line: '-' + line).cache()
    print(len(first_cached.collect()))

    plus_prefixed = first_cached.map(lambda line: '+' + line)
    second_cached = plus_prefixed.map(lambda line: '-' + line).cache()

    result = second_cached.collect()
    print(result)
    assert '-+-from pysparkling import Context' in result
def test_hdfs_file_exists():
    """Check File.exists() for a written and an unwritten HDFS path.

    Both random suffixes are converted with ``int()`` before formatting:
    the ``{:d}`` field only accepts integers, so passing the raw float
    raised ``ValueError`` before any I/O took place.
    """
    random.seed()
    fn1 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(
        HDFS_TEST_PATH, int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    # Only fn1 was written; fn2 is expected not to exist.
    assert File(fn1).exists() and not File(fn2).exists()
def test_lazy_execution_threadpool():
    """Apply a prefixing map twice, around a cache(), on a thread pool."""
    def indent_line(l):
        return '--- ' + l

    with futures.ThreadPoolExecutor(4) as executor:
        source = Context(pool=executor).textFile(
            'tests/test_multiprocessing.py')

        once = source.map(indent_line).cache()
        once.collect()

        twice = once.map(indent_line)
        result = twice.collect()
        # ThreadPool is not lazy although it returns generators.
        print(result)
        assert '--- --- from pysparkling import Context' in result
def run_feature_extraction():
    """Extract features for a directory of images and save them as text.

    Command-line options:
      --input_dir  directory handed to ``serialize_and_make_df``
      --output     path handed to ``saveAsTextFile`` (default
                   'image_features')

    Side effects: writes 'dataframe_csv_file.csv' in the working directory,
    writes the feature output, and prints progress to stdout.

    Changes from the previous version: every ``print`` is a single-argument
    call, so the function parses on Python 3 and prints the same on
    Python 2; ``df.info()`` is called without ``print`` (see below); the
    unused ``json_df_file`` local is gone.
    """
    start_time = time.time()

    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'
    parser.add_argument("--input_dir", help="input directory",
                        default=default_path)
    parser.add_argument("--output", help="output file",
                        default='image_features')
    args = parser.parse_args()

    # serialize and put all images in rdd:
    # use json schema:
    #   "image_name": "",
    #   "bytes": ""
    #   "features": "array[]"
    image_dir_path = args.input_dir
    df, data_arr = serialize_and_make_df(image_dir_path)
    print(df.head())
    # NOTE(review): assumes df is a pandas DataFrame, whose info() prints
    # its summary itself and returns None, so wrapping it in print only
    # added a stray "None" line.
    df.info()

    # Dump the frame as CSV without header or index.
    csv_df_file = 'dataframe_csv_file.csv'
    df.to_csv(csv_df_file, header=False, index=False)

    # pysparkling context; a pyspark SparkContext was the commented-out
    # alternative here.
    sc = Context()
    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)

    # submit image rdd to processing
    rdd_features = rdd.map(get_features).coalesce(1)

    # save as txt file:
    rdd_features.map(dump).saveAsTextFile(args.output)

    print("------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0))
def test_processpool_distributed_cache():
    """Check that a second collect() on a cached RDD takes under 0.3 s."""
    with futures.ProcessPoolExecutor(4) as executor:
        context = Context(
            pool=executor,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        # Three partitions, each element sleeping 0.1 s when mapped.
        sleepy = context.parallelize(range(3), 3)
        sleepy = sleepy.map(lambda _: time.sleep(0.1)).cache()
        sleepy.collect()

        # Time only the second collect().
        time_start = time.time()
        print(sleepy.collect())
        time_end = time.time()

        elapsed = time_end - time_start
        assert elapsed < 0.3
def test_hdfs_file_exists():
    """Check File.exists() for a written and an unwritten HDFS path.

    Skipped when HDFS_TEST_PATH is not set.
    """
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()
    name_template = '/pysparkling_test_{0}.txt'
    fn1 = HDFS_TEST_PATH + name_template.format(
        int(random.random() * 999999.0))
    fn2 = HDFS_TEST_PATH + name_template.format(
        int(random.random() * 999999.0))

    greetings = Context().parallelize(
        'Hello World {0}'.format(x) for x in range(10))
    greetings.saveAsTextFile(fn1)

    # Only fn1 was written; fn2 is expected not to exist.
    assert File(fn1).exists()
    assert not File(fn2).exists()
def test_lazy_execution_processpool():
    """Apply a prefixing map twice, around a cache(), on a process pool."""
    def indent_line(l):
        return '--- ' + l

    with futures.ProcessPoolExecutor(4) as executor:
        context = Context(
            pool=executor,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        source = context.textFile('tests/test_multiprocessing.py')  # .take(10)
        print(source.collect())

        once = source.map(indent_line)
        print(once.collect())

        cached = once.cache()
        print(cached.collect())

        twice = cached.map(indent_line)
        result = twice.collect()
        # ProcessPool is not lazy although it returns generators.
        print(result)
        assert '--- --- from pysparkling import Context' in result
def test_lazy_execution():
    """Check that map() callbacks do not run before collect() is called."""
    class Recorder(object):
        """Remembers whether indent_line has ever been called."""

        def __init__(self):
            self.executed = False

        def indent_line(self, l):
            # global indent_was_executed
            self.executed = True
            return '--- ' + l

    recorder = Recorder()
    rdd = Context().textFile('tests/test_multiprocessing.py')
    rdd = rdd.map(recorder.indent_line)

    # at this point, no map() or foreach() should have been executed
    exec_before_collect = recorder.executed

    rdd = rdd.map(recorder.indent_line).cache()
    print(rdd.collect())
    rdd = rdd.map(recorder.indent_line)
    rdd.collect()
    exec_after_collect = recorder.executed

    print((exec_before_collect, exec_after_collect))
    assert not exec_before_collect and exec_after_collect
def test_wholeTextFiles():
    """Look up one file via wholeTextFiles and check its content."""
    files = Context().wholeTextFiles('tests/*.py')
    matches = files.lookup('tests/test_textFile.py')
    print(matches)
    # The first looked-up value must mention this test's own name.
    assert 'test_wholeTextFiles' in matches[0]
def test_local_textFile_name():
    """Check that a textFile RDD reports the path pattern as its name."""
    rdd = Context().textFile('tests/*.py')
    name = rdd.name()
    print(name)
    assert name == 'tests/*.py'
def test_local_textFile_2():
    """Count lines across 'tests/*.py' and expect more than 90."""
    sources = Context().textFile('tests/*.py')
    line_count = sources.count()
    print(line_count)
    assert line_count > 90
def test_local_textFile_1():
    """Read files matching a glob and look for a known import line."""
    matched = Context().textFile('tests/*textFil*.py')
    lines = matched.collect()
    print(lines)
    assert 'from pysparkling import Context' in lines