Example #1
from pyspark import SparkContext
# Load the cleaned 2014 yellow-taxi CSV as an RDD of lines
distFile = SparkContext().textFile("Datacleaned/yellow_2014.csv")


def myMapFunc(line):
    # Bucket each CSV record by the value in column 13 and emit a (bucket_id, 1) pair
    fields = line.split(",")
    temp = float(fields[13])
    if temp < 15:
        return ('1', 1)
    if 15 <= temp < 40:
        return ('2', 1)
    if 40 <= temp < 70:
        return ('3', 1)
    if 70 <= temp < 100:
        return ('4', 1)
    if temp >= 100:
        return ('5', 1)


pairs = distFile.map(myMapFunc)
# counts = pairs.reduceByKey(lambda x, y: x + y)
pairs.saveAsTextFile('result.txt')
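# A minimal sketch (not part of the original example) of completing the hinted
# aggregation: sum the 1s per bucket key and inspect the per-range counts.
counts = pairs.reduceByKey(lambda x, y: x + y)
for bucket, count in counts.collect():
    print(bucket, count)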
Example #2
from pyspark import SparkContext
inputFile = '/home/zkpk/Desktop/Code/pythonproject/ml-100k/u.user'
sc = SparkContext('local', 'Statistics')
# Load the MovieLens u.user file; each line is: user id | age | gender | occupation | zip code
user_data = sc.textFile(inputFile)

user_fields = user_data.map(lambda line: line.split("|"))

num_users = user_fields.map(lambda fields: fields[0]).count()

num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()

num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()

num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
print "user number:  ", num_users
Example #3
import os

import pytest
from pyspark import SparkContext


class TestRDD(object):
    def setup_method(self):
        self.rdd = SparkContext().parallelize([('a', 7), ('a', 2), ('b', 2)])

    # ---------------
    # Transformations
    # ---------------
    def test_map(self):
        res = self.rdd.map(lambda x: x[0]).collect()
        assert res == ['a', 'a', 'b']

    def test_flatMap(self):
        res = self.rdd.flatMap(lambda x: x).collect()
        assert res == ['a', 7, 'a', 2, 'b', 2]

    def test_mapValues(self):
        res = self.rdd.groupByKey().mapValues(sum).collect()
        assert res == [('a', 9), ('b', 2)]

    def test_filter(self):
        res = self.rdd.filter(lambda x: x[0] == 'a').collect()
        assert res == [('a', 7), ('a', 2)]

    def test_keys(self):
        res = self.rdd.keys().collect()
        assert res == ['a', 'a', 'b']

    def test_values(self):
        res = self.rdd.values().collect()
        assert res == [7, 2, 2]

    def test_sample(self):
        res = self.rdd.sample(False, 0.5, 7)
        assert res.count() == 1

        res = self.rdd.sample(False, 0.7, 7)
        assert res.count() == 2

    def test_groupBy(self):
        res = self.rdd.groupBy(lambda x: x[1])
        assert res.collect() == [(7, [('a', 7)]), (2, [('a', 2), ('b', 2)])]

    def test_groupByKey(self):
        res = self.rdd.groupByKey()
        assert res.collect() == [('a', [7, 2]), ('b', [2])]

    def test_reduceByKey(self):
        res = self.rdd.reduceByKey(sum)
        assert res.collect() == [('a', 9), ('b', 2)]

    def test_sortBy(self):
        res = self.rdd.sortBy(lambda x: x[1])
        assert res.collect() == [('a', 2), ('b', 2), ('a', 7)]

    def test_sortByKey(self):
        res = self.rdd.sortByKey()
        assert res.collect() == [('a', 7), ('a', 2), ('b', 2)]

    # -------
    # Actions
    # -------
    def test_getNumPartitions(self):
        assert self.rdd.getNumPartitions() == 10

    def test_collect(self):
        assert self.rdd.collect() == [('a', 7), ('a', 2), ('b', 2)]

    def test_count(self):
        assert self.rdd.count() == 3

    def test_countByValue(self):
        assert self.rdd.countByValue() == {
            ('a', 7): 1,
            ('a', 2): 1,
            ('b', 2): 1
        }

    def test_countByKey(self):
        assert self.rdd.countByKey() == {'a': 2, 'b': 1}

    def test_isEmpty(self):
        assert not self.rdd.isEmpty()

    def test_sum(self):
        assert self.rdd.values().sum() == 11

    def test_max(self):
        assert self.rdd.values().max() == 7

    def test_min(self):
        assert self.rdd.values().min() == 2

    def test_mean(self):
        assert self.rdd.values().mean() == pytest.approx(3.66, 0.01)

    def test_stdev(self):
        assert self.rdd.values().stdev() == pytest.approx(2.35, 0.01)

    def test_variance(self):
        assert self.rdd.values().variance() == pytest.approx(5.55, 0.01)

    def test_first(self):
        assert self.rdd.first() == ('a', 7)

    def test_take(self):
        assert self.rdd.take(2) == [('a', 7), ('a', 2)]

    def test_top(self):
        assert self.rdd.values().top(1) == [7]

    def test_foreach(self):
        # Very difficult to test foreach because it returns None
        pass
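
    def test_foreach_side_effect(self):
        # Sketch, not part of the original example: foreach can still be
        # exercised through a side effect such as a Spark accumulator.
        acc = self.rdd.context.accumulator(0)
        self.rdd.foreach(lambda kv: acc.add(kv[1]))
        assert acc.value == 11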

    def test_reduce(self):
        assert self.rdd.values().reduce(lambda x, y: x + y) == 11

    def test_saveAsTextFile(self):
        filename = 'output.txt'
        self.rdd.saveAsTextFile(filename)
        with open(filename, 'r') as f:
            lines = f.read().splitlines()
            assert lines[0] == 'a,7'
        os.remove(filename)
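
# Note: instantiating a new SparkContext in every setup_method fails against a
# real Spark installation, since only one SparkContext may be active per process.
# One common workaround (a sketch, not from the original example) is a shared,
# session-scoped pytest fixture:
@pytest.fixture(scope="session")
def spark_context():
    sc = SparkContext("local[2]", "rdd-tests")
    yield sc
    sc.stop()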
Example #4

from math import sqrt

from pyspark import SparkConf, SparkContext


# Cosine-similarity helper; the original snippet begins mid-function, so the
# function name and the running-sum initialization are reconstructed here.
def computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = (numerator / float(denominator)) if denominator else 0
    return (score, numPairs)
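
# filterDuplicates and makePairs are used further down but are not defined in
# the original snippet; the functions below are plausible sketches, not the
# original author's implementations.
def filterDuplicates(userRatings):
    # Record shape after the self-join: (userID, ((movie1, rating1), (movie2, rating2))).
    # Keep each unordered movie pair exactly once by requiring movie1 < movie2.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return movie1 < movie2


def makePairs(userRatings):
    # Re-key the record by the movie pair, keeping the two ratings as the value.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return ((movie1, movie2), (rating1, rating2))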

# Use Spark's built-in cluster manager to treat every core of this laptop as a node
print("\nLoading movie names...")

# Build a SparkContext and create ratings: (userID, (movieID, rating))
data = SparkContext(conf=SparkConf()).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
joinedRatings = ratings.join(ratings)  # (userID, ((movieID1, rating1), (movieID2, rating2)))

# Filter out duplicate pairs; filterDuplicates returns True or False for each record.
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()  #[(movie1, movie2), ((rating1, rating2), (rating1, rating2) ...)]
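
# The similarity computation itself is not part of this snippet; a minimal
# sketch of the remaining step (an assumption, reusing the helper defined above):
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()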