Python SparkContext.collect Beispiele

Programmiersprache: Python

Namespace / Paketname: pyspark

Klasse / Typ: SparkContext

Methode / Funktion: collect

Beispiele auf hotexamples.com: 2

Python SparkContext.collect - 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Methode pyspark.SparkContext.collect, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

setLogLevel(30)

setSystemProperty(30)

setCheckpointDir(30)

getConf(30)

parallelize(30)

pickleFile(30)

broadcast(30)

emptyRDD(30)

newAPIHadoopFile(30)

binaryFiles(30)

addPyFile(30)

addFile(30)

accumulator(30)

getOrCreate(30)

SparkContext(30)

sequenceFile(30)

newAPIHadoopRDD(25)

_ensure_initialized(14)

createDataFrame(11)

hadoopFile(10)

show_profiles(9)

range(8)

dump_profiles(6)

mongoRDD(6)

binaryRecords(6)

map(4)

setLocalProperty(3)

runJob(3)

flatMap(2)

cassandraTable(2)

collect(2)

close(2)

setJobGroup(2)

paralellize(1)

neo4jTable(1)

neo4jConfig(1)

parallelise(1)

BSONFileRDD(1)

parallelized(1)

parallize(1)

reduceByKey(1)

sample(1)

mongoPairRDD(1)

setMaster(1)

show_profile(1)

sortBy(1)

saveAsTextFile(1)

hadoopConfiguration(1)

mixin(1)

filter(1)

Beispiel #1

Datei anzeigen

class TestRDD(object):
    """Exercise RDD transformations and actions on a small key/value dataset.

    Every test runs against a fresh RDD that ``setup_method`` builds from the
    pairs ``('a', 7)``, ``('a', 2)`` and ``('b', 2)``.
    """

    def setup_method(self):
        """Build the three-pair RDD that each test operates on."""
        self.rdd = SparkContext().parallelize([('a', 7), ('a', 2), ('b', 2)])

    # ---------------
    # Transformations
    # ---------------
    def test_map(self):
        """map() applied to each pair keeps only the key."""
        res = self.rdd.map(lambda x: x[0]).collect()
        assert res == ['a', 'a', 'b']

    def test_flatMap(self):
        """flatMap() flattens every pair into its two elements."""
        res = self.rdd.flatMap(lambda x: x).collect()
        assert res == ['a', 7, 'a', 2, 'b', 2]

    def test_mapValues(self):
        """mapValues(sum) over the grouped values yields one total per key."""
        res = self.rdd.groupByKey().mapValues(sum).collect()
        # The expected totals follow from the groups asserted in
        # test_groupByKey: [7, 2] -> 9 and [2] -> 2.
        assert res == [('a', 9), ('b', 2)]

    def test_filter(self):
        """filter() keeps only the pairs whose key is 'a'."""
        res = self.rdd.filter(lambda x: x[0] == 'a').collect()
        assert res == [('a', 7), ('a', 2)]

    def test_keys(self):
        """keys() returns the first element of every pair."""
        res = self.rdd.keys().collect()
        assert res == ['a', 'a', 'b']

    def test_values(self):
        """values() returns the second element of every pair."""
        res = self.rdd.values().collect()
        assert res == [7, 2, 2]

    def test_sample(self):
        """sample() with a fixed third argument returns a stable item count."""
        # NOTE(review): the arguments look like (withReplacement, fraction,
        # seed); the expected counts depend on the sampler implementation.
        res = self.rdd.sample(False, 0.5, 7)
        assert res.count() == 1

        res = self.rdd.sample(False, 0.7, 7)
        assert res.count() == 2

    def test_groupBy(self):
        """groupBy() buckets whole pairs under the computed key."""
        res = self.rdd.groupBy(lambda x: x[1])
        assert res.collect() == [(7, [('a', 7)]), (2, [('a', 2), ('b', 2)])]

    def test_groupByKey(self):
        """groupByKey() gathers the values that share a key."""
        res = self.rdd.groupByKey()
        assert res.collect() == [('a', [7, 2]), ('b', [2])]

    def test_reduceByKey(self):
        """reduceByKey() collapses the values of each key into one result."""
        # NOTE(review): the built-in sum is passed as the reducer. A reducer
        # that is called with two values would make sum() raise TypeError, so
        # this presumably relies on the SparkContext under test handing the
        # reducer an iterable -- confirm against that implementation.
        res = self.rdd.reduceByKey(sum)
        assert res.collect() == [('a', 9), ('b', 2)]

    def test_sortBy(self):
        """sortBy() orders the pairs by the computed key (here: the value)."""
        res = self.rdd.sortBy(lambda x: x[1])
        assert res.collect() == [('a', 2), ('b', 2), ('a', 7)]

    def test_sortByKey(self):
        """sortByKey() orders the pairs by their key."""
        res = self.rdd.sortByKey()
        assert res.collect() == [('a', 7), ('a', 2), ('b', 2)]

    # -------
    # Actions
    # -------
    def test_getNumPartitions(self):
        """The RDD built in setup_method reports 10 partitions."""
        assert self.rdd.getNumPartitions() == 10

    def test_collect(self):
        """collect() returns the pairs in their original order."""
        assert self.rdd.collect() == [('a', 7), ('a', 2), ('b', 2)]

    def test_count(self):
        """count() returns the number of pairs."""
        assert self.rdd.count() == 3

    def test_countByValue(self):
        """countByValue() maps every distinct pair to its occurrence count."""
        assert self.rdd.countByValue() == {
            ('a', 7): 1,
            ('a', 2): 1,
            ('b', 2): 1
        }

    def test_countByKey(self):
        """countByKey() maps every key to the number of pairs carrying it."""
        assert self.rdd.countByKey() == {'a': 2, 'b': 1}

    def test_isEmpty(self):
        """A populated RDD is not reported as empty."""
        assert not self.rdd.isEmpty()

    def test_sum(self):
        """sum() adds up all values."""
        assert self.rdd.values().sum() == 11

    def test_max(self):
        """max() returns the largest value."""
        assert self.rdd.values().max() == 7

    def test_min(self):
        """min() returns the smallest value."""
        assert self.rdd.values().min() == 2

    def test_mean(self):
        """mean() of 7, 2, 2 is about 3.66."""
        assert self.rdd.values().mean() == pytest.approx(3.66, 0.01)

    def test_stdev(self):
        """stdev() of 7, 2, 2 is about 2.35."""
        assert self.rdd.values().stdev() == pytest.approx(2.35, 0.01)

    def test_variance(self):
        """variance() of 7, 2, 2 is about 5.55."""
        assert self.rdd.values().variance() == pytest.approx(5.55, 0.01)

    def test_first(self):
        """first() returns the first pair."""
        assert self.rdd.first() == ('a', 7)

    def test_take(self):
        """take(2) returns the first two pairs."""
        assert self.rdd.take(2) == [('a', 7), ('a', 2)]

    def test_top(self):
        """top(1) returns the single largest value."""
        assert self.rdd.values().top(1) == [7]

    def test_foreach(self):
        """Placeholder: foreach is not exercised yet."""
        # Very difficult to test foreach because it returns None
        pass

    def test_reduce(self):
        """reduce() folds all values with the given binary function."""
        assert self.rdd.values().reduce(lambda x, y: x + y) == 11

    def test_saveAsTextFile(self):
        """saveAsTextFile() writes the first pair as the line 'a,7'."""
        filename = 'output.txt'
        try:
            self.rdd.saveAsTextFile(filename)
            with open(filename, 'r') as f:
                lines = f.read().splitlines()
            assert lines[0] == 'a,7'
        finally:
            # Clean up even when the write or the assertion fails, so a
            # stale output file cannot leak into later runs.
            if os.path.exists(filename):
                os.remove(filename)

Beispiel #2

Datei anzeigen

import re
from pyspark import SparkConf, SparkContext

def normalizeWords(text):
    """Lower-case *text* and split it on runs of non-word characters.

    Returns the list of resulting tokens. Separators at the very start or
    end of *text* produce an empty string at that end of the list.
    """
    lowered = text.lower()
    return re.split(r'\W+', lowered, flags=re.UNICODE)

# Configure a local Spark application named "WordCount".
conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext(conf=conf)

# Named `lines` rather than `input` so the built-in input() is not shadowed.
lines = sc.textFile("file:///sparkcourse/book.txt")
words = lines.flatMap(normalizeWords)

# Count the occurrences of every word, then flip each pair to (count, word)
# so that sortByKey sorts on the count.
wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
wordCountsSorted = wordCounts.map(lambda x: (x[1], x[0])).sortByKey()
results = wordCountsSorted.collect()

for count, raw_word in results:
    # Drop non-ASCII characters; a word that ends up empty is skipped.
    word = raw_word.encode('ascii', 'ignore').decode('ascii')
    if word:
        print(word + ":\t\t" + str(count))

//checking for hackto
sc.collect()
sc.map(x: -> error done)