Esempi in Python per Context.Context

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: pysparkling

Classe/tipologia: Context

Metodo/funzione: Context

Esempi su hotexamples.com: 30

Context.Context in Python: 30 esempi trovati. Questi sono i migliori esempi reali in Python per pysparkling.Context.Context, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

Context(30)

collect(23)

saveAsTextFile(10)

count(10)

parallelize(9)

map(6)

textFile(3)

mean(2)

foreach(2)

lookup(2)

startswith(2)

takeSample(2)

take(2)

union(2)

first(2)

filter(2)

toLocalIterator(1)

top(1)

pipe(1)

sum(1)

subtract(1)

zip(1)

cartesian(1)

sample(1)

rightOuterJoin(1)

reduceByKey(1)

reduce(1)

countByValue(1)

persist(1)

groupBy(1)

flatMap(1)

flatMapValues(1)

fold(1)

foldByKey(1)

foreachPartition(1)

getNumPartitions(1)

histogram(1)

countByKey(1)

intersection(1)

join(1)

keyBy(1)

leftOuterJoin(1)

cache(1)

mapPartitions(1)

max(1)

zipWithUniqueId(1)

Esempio n. 1

Mostra file

def test_saveAsTextFile_zip():
    """Save range(10) to a '.zip' path and check that '5' can be read back."""
    # The temporary file is created and closed only to obtain a unique name.
    handle = tempfile.NamedTemporaryFile(delete=True)
    handle.close()
    zip_path = handle.name + '.zip'

    numbers = Context().parallelize(range(10))
    numbers.saveAsTextFile(zip_path)

    read_rdd = Context().textFile(zip_path)
    print(read_rdd.collect())
    assert '5' in read_rdd.collect()

Esempio n. 2

Mostra file

def test_local_regex_read():
    """Save range(30) in 30 partitions, then read back via the 'part-0000*' pattern."""
    # was not working before 0.3.19
    scratch = tempfile.NamedTemporaryFile(delete=True)
    scratch.close()
    out_dir = scratch.name

    Context().parallelize(range(30), 30).saveAsTextFile(out_dir)

    pattern = out_dir + '/part-0000*'
    d = Context().textFile(pattern).collect()
    print(d)
    assert len(d) == 10

Esempio n. 3

Mostra file

    def create_context(n_processes=0):
        """Build a Context, backed by a process pool when ``n_processes`` is truthy.

        With a falsy ``n_processes`` (the default 0) a plain ``Context()`` is
        returned. Otherwise a ``futures.ProcessPoolExecutor(n_processes)`` is
        passed as ``pool``, with ``cloudpickle.dumps`` as serializer and
        ``pickle.loads`` as deserializer.
        """
        if n_processes:
            executor = futures.ProcessPoolExecutor(n_processes)
            return Context(
                pool=executor,
                serializer=cloudpickle.dumps,
                deserializer=pickle.loads,
            )
        return Context()

Esempio n. 4

Mostra file

def test_s3_textFile_loop():
    """Round-trip 200 lines through a randomly named file under S3_TEST_PATH.

    The lines are saved with ``saveAsTextFile``, read back with ``textFile``,
    and both the counts and the element-wise contents are compared.
    """
    random.seed()

    # The 'd' format code raises ValueError for floats, so truncate to int first.
    suffix = int(random.random() * 999999.0)
    fn = f'{S3_TEST_PATH}/pysparkling_test_{suffix:d}.txt'

    rdd = Context().parallelize(f'Line {n}' for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))

Esempio n. 5

Mostra file

File: test_textFile.py Progetto: szdbl/pysparkling

def test_gs_textFile_loop():
    """Round-trip 200 lines through a randomly named file under GS_TEST_PATH.

    The lines are saved with ``saveAsTextFile``, read back with ``textFile``,
    and both the counts and the element-wise contents are compared.
    """
    random.seed()

    # '{:d}' raises ValueError for floats, so truncate the random value to int.
    fn = '{}/pysparkling_test_{:d}.txt'.format(GS_TEST_PATH,
                                               int(random.random() * 999999.0))

    rdd = Context().parallelize('Line {0}'.format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))

Esempio n. 6

Mostra file

def test_hdfs_textFile_loop():
    """Round-trip ten 'Hello World' lines through a random file under HDFS_TEST_PATH.

    The lines are saved with ``saveAsTextFile``, read back with ``textFile``,
    and both the counts and the element-wise contents are compared.
    """
    random.seed()

    # The 'd' format code raises ValueError for floats, so truncate to int first.
    suffix = int(random.random() * 999999.0)
    fn = f'{HDFS_TEST_PATH}/pysparkling_test_{suffix:d}.txt'
    print(f'HDFS test file: {fn}')

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (rdd.count() == read_rdd.count()
            and all(r1 == r2
                    for r1, r2 in zip(rdd.collect(), read_rdd.collect())))

Esempio n. 7

Mostra file

File: test_textFile.py Progetto: szdbl/pysparkling

def test_hdfs_textFile_loop():
    """Round-trip ten 'Hello World' lines through a random file under HDFS_TEST_PATH.

    The lines are saved with ``saveAsTextFile``, read back with ``textFile``,
    and both the counts and the element-wise contents are compared.
    """
    random.seed()

    # '{:d}' raises ValueError for floats, so truncate the random value to int.
    fn = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH,
                                               int(random.random() * 999999.0))
    print('HDFS test file: {0}'.format(fn))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn)
    read_rdd = Context().textFile(fn)
    print(rdd.collect())
    print(read_rdd.collect())
    assert (rdd.count() == read_rdd.count()
            and all(r1 == r2
                    for r1, r2 in zip(rdd.collect(), read_rdd.collect())))

Esempio n. 8

Mostra file

def test_read_7z():
    """Read 'tests/data.7z' and check that it contains the expected import line."""
    # file was created with:
    # 7z a tests/data.7z tests/readme_example.py
    # (brew install p7zip)
    archive = 'tests/data.7z'
    wanted = 'from pysparkling import Context'

    rdd = Context().textFile(archive)
    print(rdd.collect())
    assert wanted in rdd.collect()

Esempio n. 9

Mostra file

File: test_kmeans.py Progetto: kasper189/pykmeans

    def test_algorith_excution(self):
        """Run the clustering on data/objects.json with basic parameters.

        Checks that 10 items are labeled, that the first label lies in
        [0, 2), and that as many centroids as clusters are returned.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        json_path = os.path.join(here, 'data', 'objects.json')

        context = Context(
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        dataset = context.textFile(json_path).map(json.loads)
        dataset.persist()

        number_of_cluster = 2
        algorithm_settings = settings.ClusteringSetting(
            number_of_cluster, 2, 0.01, AlgorithmProvider.random_distance)

        outcome = kmeans.compute_cluster(
            dataset, algorithm_settings, AlgorithmProvider.random_sampling,
            AlgorithmProvider.random_cluster, AlgorithmProvider.dummy_update)
        labeled, centroids = outcome

        labeled.collect()
        self.assertEqual(10, labeled.count())

        first_label = labeled.first()[0]
        self.assertTrue(0 <= first_label < 2)
        self.assertEqual(number_of_cluster, len(centroids))

Esempio n. 10

Mostra file

 def test_session_storage_level(self):
     """Compare a DataFrame's storage level repr before and after persist()."""
     session = SparkSession(Context())
     frame = session.range(4, numPartitions=2)

     level_before = repr(frame.storageLevel)
     self.assertEqual(level_before, repr(StorageLevel(False, False, False, False, 1)))

     persisted_df = frame.persist()
     self.assertEqual(persisted_df.is_cached, True)

     level_after = repr(persisted_df.storageLevel)
     self.assertEqual(level_after, repr(StorageLevel.MEMORY_ONLY))

Esempio n. 11

Mostra file

File: test_multiprocessing.py Progetto: ainkov/pysparkling

def test_first_mp():
    """Check that first() returns 1 on a 3-partition RDD backed by a process pool.

    The Context is given a ``multiprocessing.Pool(4)`` with
    ``cloudpickle.dumps`` as serializer and ``pickle.loads`` as deserializer.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 2, 2, 4, 1, 3, 5, 9], 3)
        print(my_rdd.first())
        assert my_rdd.first() == 1
    finally:
        # Stop the worker processes even when the assertion fails, so the
        # pool does not leak into later tests.
        p.terminate()
        p.join()

Esempio n. 12

Mostra file

def test_s3_textFile_loop():
    """Round-trip 200 lines through a randomly named file under S3_TEST_PATH.

    Raises SkipTest when AWS_ACCESS_KEY_ID or S3_TEST_PATH is falsy.
    Otherwise the lines are saved, read back, and compared by count and
    element by element.
    """
    if not AWS_ACCESS_KEY_ID or not S3_TEST_PATH:
        raise SkipTest

    random.seed()

    # Mixing '{}' and '{0}' in one format string raises ValueError, so both
    # fields are numbered explicitly.
    fn = '{0}/pysparkling_test_{1}.txt'.format(S3_TEST_PATH,
                                               int(random.random() * 999999.0))

    rdd = Context().parallelize("Line {0}".format(n) for n in range(200))
    rdd.saveAsTextFile(fn)
    rdd_check = Context().textFile(fn)

    assert (rdd.count() == rdd_check.count()
            and all(e1 == e2
                    for e1, e2 in zip(rdd.collect(), rdd_check.collect())))

Esempio n. 13

Mostra file

def test_gs_textFile_loop():
    """Round-trip 200 lines through a randomly named file under GS_TEST_PATH.

    Raises SkipTest when OAUTH2_CLIENT_ID or GS_TEST_PATH is falsy.
    """
    if not (OAUTH2_CLIENT_ID and GS_TEST_PATH):
        raise SkipTest

    random.seed()

    suffix = int(random.random() * 999999.0)
    fn = GS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix)

    written = Context().parallelize("Line {0}".format(n) for n in range(200))
    written.saveAsTextFile(fn)
    reread = Context().textFile(fn)

    # Compare sizes first; contents are only compared when the sizes agree.
    counts_match = written.count() == reread.count()
    assert counts_match and all(
        left == right
        for left, right in zip(written.collect(), reread.collect())
    )

Esempio n. 14

Mostra file

def test_s3_textFile():
    """Read the 'warc.paths.*' files from S3 and look for one known entry."""
    source = ('s3n://aws-publicdatasets/common-crawl/crawl-data/'
              'CC-MAIN-2015-11/warc.paths.*')
    expected_entry = (
        'common-crawl/crawl-data/CC-MAIN-2015-11/segments/1424937481488.49/'
        'warc/CC-MAIN-20150226075801-00329-ip-10-28-5-156.ec2.'
        'internal.warc.gz')

    myrdd = Context().textFile(source)
    assert expected_entry in myrdd.collect()

Esempio n. 15

Mostra file

def test_hdfs_textFile_loop():
    """Round-trip ten 'Hello World' lines through a random file under HDFS_TEST_PATH.

    Raises SkipTest when HDFS_TEST_PATH is falsy.
    """
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    suffix = int(random.random()*999999.0)
    fn = HDFS_TEST_PATH+'/pysparkling_test_{0}.txt'.format(suffix)

    written = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    written.saveAsTextFile(fn)
    reread = Context().textFile(fn)

    # Compare sizes first; contents are only compared when the sizes agree.
    counts_match = written.count() == reread.count()
    assert counts_match and all(
        left == right
        for left, right in zip(written.collect(), reread.collect())
    )

Esempio n. 16

Mostra file

File: test_multiprocessing.py Progetto: ainkov/pysparkling

def test_multiprocessing():
    """Square [1, 3, 4] through a process-pool-backed Context and expect 16.

    The Context is given a ``multiprocessing.Pool(4)`` with
    ``cloudpickle.dumps`` as serializer and ``pickle.loads`` as deserializer.
    """
    p = multiprocessing.Pool(4)
    try:
        c = Context(pool=p, serializer=cloudpickle.dumps,
                    deserializer=pickle.loads)
        my_rdd = c.parallelize([1, 3, 4])
        r = my_rdd.map(lambda x: x*x).collect()
        print(r)
        assert 16 in r
    finally:
        # Stop the worker processes even when the assertion fails, so the
        # pool does not leak into later tests.
        p.terminate()
        p.join()

Esempio n. 17

Mostra file

def test_saveAsTextFile():
    """Save range(10) to a temporary path and check that the line '5' was written."""
    # The temporary file is created and closed only to obtain a unique name.
    handle = tempfile.NamedTemporaryFile(delete=True)
    handle.close()
    target = handle.name

    Context().parallelize(range(10)).saveAsTextFile(target)

    with open(target, 'r') as f:
        r = f.readlines()
    print(r)
    assert '5\n' in r

Esempio n. 18

Mostra file

def test_hdfs_file_exists():
    """Check File.exists() for a written and an unwritten path under HDFS_TEST_PATH.

    Only the first randomly named file is saved; the second is expected not
    to exist.
    """
    random.seed()

    # The 'd' format code raises ValueError for floats, so truncate to int first.
    suffix1 = int(random.random() * 999999.0)
    suffix2 = int(random.random() * 999999.0)
    fn1 = f'{HDFS_TEST_PATH}/pysparkling_test_{suffix1:d}.txt'
    fn2 = f'{HDFS_TEST_PATH}/pysparkling_test_{suffix2:d}.txt'

    rdd = Context().parallelize(f'Hello World {x}' for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()

Esempio n. 19

Mostra file

def test_cache():
    """Chain map() and cache() calls and check the accumulated '-+-' prefix."""
    # this crashes in version 0.2.28
    def prepend_minus(line):
        return '-' + line

    def prepend_plus(line):
        return '+' + line

    first_stage = Context().textFile('tests/*textFil*.py')
    first_stage = first_stage.map(prepend_minus).cache()
    print(len(first_stage.collect()))

    second_stage = first_stage.map(prepend_plus).map(prepend_minus).cache()
    result = second_stage.collect()
    print(result)
    assert '-+-from pysparkling import Context' in result

Esempio n. 20

Mostra file

File: test_textFile.py Progetto: szdbl/pysparkling

def test_hdfs_file_exists():
    """Check File.exists() for a written and an unwritten path under HDFS_TEST_PATH.

    Only the first randomly named file is saved; the second is expected not
    to exist.
    """
    random.seed()

    # '{:d}' raises ValueError for floats, so truncate the random values to int.
    fn1 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH,
                                                int(random.random() * 999999.0))
    fn2 = '{}/pysparkling_test_{:d}.txt'.format(HDFS_TEST_PATH,
                                                int(random.random() * 999999.0))

    rdd = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    rdd.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()

Esempio n. 21

Mostra file

def test_lazy_execution_threadpool():
    """Map twice around a cache() with a thread pool and check the doubled prefix."""
    def indent_line(line):
        return '--- ' + line

    with futures.ThreadPoolExecutor(4) as executor:
        context = Context(pool=executor)
        once = context.textFile('tests/test_multiprocessing.py').map(indent_line).cache()
        once.collect()
        twice = once.map(indent_line).collect()
        # ThreadPool is not lazy although it returns generators.
        print(twice)
        assert '--- --- from pysparkling import Context' in twice

Esempio n. 22

Mostra file

def run_feature_extraction():
    """Extract features for the images in a directory and save them as text.

    Command-line arguments:
        --input_dir: directory holding the input images.
        --output: path handed to ``saveAsTextFile`` (default 'image_features').

    The images are loaded through ``serialize_and_make_df``, the resulting
    DataFrame is written to 'dataframe_csv_file.csv', and the data array is
    processed with ``get_features`` and ``dump`` on a pysparkling Context.
    The elapsed time in minutes is printed at the end.
    """
    start_time = time.time()
    desc = 'Feature Extraction for Images'
    parser = argparse.ArgumentParser(
        description=desc,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=desc)
    default_path = '/media/chris/cschulze_external_4tb/receipt_classifier_images/nonreceipts/train2014'

    parser.add_argument("--input_dir",
                        help="input directory",
                        default=default_path)
    parser.add_argument("--output",
                        help="output file",
                        default='image_features')
    args = parser.parse_args()

    # serialize and put all images in rdd:
    # use json schema:
    #     "image_name": "",
    #     "bytes": ""
    #     "features": "array[]"
    image_dir_path = args.input_dir
    df, data_arr = serialize_and_make_df(image_dir_path)
    # Single-argument print(...) calls are valid on both Python 2 and 3.
    print(df.head())
    print(df.info())

    # Dump the DataFrame to CSV without header or index.
    csv_df_file = 'dataframe_csv_file.csv'
    df.to_csv(csv_df_file, header=False, index=False)

    # pysparkling context
    sc = Context()

    num_parts = 4
    rdd = sc.parallelize(data_arr, num_parts)
    # submit image rdd to processing
    rdd_features = rdd.map(get_features).coalesce(1)
    # save as txt file:
    rdd_features.map(dump).saveAsTextFile(args.output)
    print("------------------ %f minutes elapsed ------------------------" % (
        (time.time() - start_time) / 60.0))

Esempio n. 23

Mostra file

def test_processpool_distributed_cache():
    """Check that a second collect() on a cached RDD takes under 0.3 seconds.

    Each of the three elements is mapped through a 0.1 second sleep before
    the RDD is cached and collected once.
    """
    with futures.ProcessPoolExecutor(4) as executor:
        context = Context(
            pool=executor,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        sleepy = context.parallelize(range(3), 3)
        cached = sleepy.map(lambda _: time.sleep(0.1)).cache()
        cached.collect()

        time_start = time.time()
        print(cached.collect())
        elapsed = time.time() - time_start
        assert elapsed < 0.3

Esempio n. 24

Mostra file

def test_hdfs_file_exists():
    """Check File.exists() for a written and an unwritten path under HDFS_TEST_PATH.

    Raises SkipTest when HDFS_TEST_PATH is falsy.
    """
    if not HDFS_TEST_PATH:
        raise SkipTest

    random.seed()

    def random_test_path():
        # Build a path under HDFS_TEST_PATH with a random integer suffix.
        suffix = int(random.random() * 999999.0)
        return HDFS_TEST_PATH + '/pysparkling_test_{0}.txt'.format(suffix)

    fn1 = random_test_path()
    fn2 = random_test_path()

    greetings = Context().parallelize('Hello World {0}'.format(x) for x in range(10))
    greetings.saveAsTextFile(fn1)

    assert File(fn1).exists() and not File(fn2).exists()

Esempio n. 25

Mostra file

def test_lazy_execution_processpool():
    """Map twice around a cache() with a process pool and check the doubled prefix."""
    def indent_line(line):
        return '--- ' + line

    with futures.ProcessPoolExecutor(4) as executor:
        context = Context(
            pool=executor,
            serializer=cloudpickle.dumps,
            deserializer=pickle.loads,
        )
        raw = context.textFile('tests/test_multiprocessing.py')  # .take(10)
        print(raw.collect())

        once = raw.map(indent_line)
        print(once.collect())

        once_cached = once.cache()
        print(once_cached.collect())

        twice = once_cached.map(indent_line).collect()
        # ProcessPool is not lazy although it returns generators.
        print(twice)
        assert '--- --- from pysparkling import Context' in twice

Esempio n. 26

Mostra file

def test_lazy_execution():
    """Check that map() does not run its function until collect() is called."""
    class Recorder(object):
        """Remembers whether indent_line has been called."""

        def __init__(self):
            self.executed = False

        def indent_line(self, line):
            self.executed = True
            return '--- ' + line

    source = Context().textFile('tests/test_multiprocessing.py')
    recorder = Recorder()

    mapped_once = source.map(recorder.indent_line)
    # Snapshot the flag before any collect(): no map() should have run yet.
    exec_before_collect = recorder.executed

    mapped_twice = mapped_once.map(recorder.indent_line).cache()
    print(mapped_twice.collect())

    mapped_thrice = mapped_twice.map(recorder.indent_line)
    mapped_thrice.collect()
    exec_after_collect = recorder.executed

    print((exec_before_collect, exec_after_collect))
    assert not exec_before_collect and exec_after_collect

Esempio n. 27

Mostra file

def test_wholeTextFiles():
    """Look up 'tests/test_textFile.py' among whole-file records and check its text."""
    whole_files = Context().wholeTextFiles('tests/*.py')
    t = whole_files.lookup('tests/test_textFile.py')
    print(t)
    first_match = t[0]
    assert 'test_wholeTextFiles' in first_match

Esempio n. 28

Mostra file

def test_local_textFile_name():
    """Check that an RDD from textFile() reports the source pattern as its name."""
    pattern = 'tests/*.py'
    rdd = Context().textFile(pattern)
    name = rdd.name()
    print(name)
    assert name == 'tests/*.py'

Esempio n. 29

Mostra file

def test_local_textFile_2():
    """Check that the files matching 'tests/*.py' hold more than 90 lines in total."""
    rdd = Context().textFile('tests/*.py')
    line_count = rdd.count()
    print(line_count)
    assert line_count > 90

Esempio n. 30

Mostra file

def test_local_textFile_1():
    """Read files matching 'tests/*textFil*.py' and look for the import line."""
    rdd = Context().textFile('tests/*textFil*.py')
    lines = rdd.collect()
    print(lines)
    wanted = 'from pysparkling import Context'
    assert wanted in lines