from pyspark import SparkContext

sc = SparkContext()
distFile = sc.textFile("Datacleaned/yellow_2014.csv")

def myMapFunc(line):
    # Bucket each record by the numeric value in field 13 and emit (bucket_id, 1).
    fields = line.split(",")
    temp = float(fields[13])
    if temp < 15:
        return ('1', 1)
    elif temp < 40:
        return ('2', 1)
    elif temp < 70:
        return ('3', 1)
    elif temp < 100:
        return ('4', 1)
    else:
        return ('5', 1)

pairs = distFile.map(myMapFunc)
# reduceByKey takes a two-argument combiner; the original single-argument
# lambda would have failed, so the per-bucket counts were never computed.
counts = pairs.reduceByKey(lambda x, y: x + y)
# Note: saveAsTextFile creates a directory named result.txt containing part files.
counts.saveAsTextFile('result.txt')
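# A quick way to sanity-check the result without reading the saved part files
# is to collect the counts on the driver -- a sketch, safe here only because
# the reduced RDD holds at most five (bucket, count) pairs:
for bucket, count in sorted(counts.collect()):
    print(bucket, count)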
from pyspark import SparkContext

inputFile = '/home/zkpk/Desktop/Code/pythonproject/ml-100k/u.user'
sc = SparkContext('local', 'Statistics')
# The original called map() on the SparkContext itself; the file has to be
# loaded into an RDD first.
user_data = sc.textFile(inputFile)

# u.user format: user id | age | gender | occupation | zip code
user_fields = user_data.map(lambda line: line.split("|"))
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
print("user number:", num_users)
print("gender number:", num_genders)
print("occupation number:", num_occupations)
print("zipcode number:", num_zipcodes)
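# Each count()/distinct().count() above launches a separate job that re-reads
# and re-splits the input file. Since user_fields is reused four times, caching
# the parsed RDD avoids the repeated scans -- a one-line change (sketch):
user_fields = user_data.map(lambda line: line.split("|")).cache()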
import os
import shutil

import pytest
from pyspark import SparkContext


class TestRDD(object):
    def setup_method(self):
        # getOrCreate() reuses a single SparkContext across tests; constructing
        # a fresh one per test would fail, since only one may be active at a time.
        sc = SparkContext.getOrCreate()
        # Explicitly ask for 10 slices so test_getNumPartitions sees 10 partitions.
        self.rdd = sc.parallelize([('a', 7), ('a', 2), ('b', 2)], 10)

    # ---------------
    # Transformations
    # ---------------
    def test_map(self):
        res = self.rdd.map(lambda x: x[0]).collect()
        assert res == ['a', 'a', 'b']

    def test_flatMap(self):
        res = self.rdd.flatMap(lambda x: x).collect()
        assert res == ['a', 7, 'a', 2, 'b', 2]

    def test_mapValues(self):
        res = self.rdd.groupByKey().mapValues(sum).collect()
        assert sorted(res) == [('a', 9), ('b', 2)]

    def test_filter(self):
        res = self.rdd.filter(lambda x: x[0] == 'a').collect()
        assert res == [('a', 7), ('a', 2)]

    def test_keys(self):
        res = self.rdd.keys().collect()
        assert res == ['a', 'a', 'b']

    def test_values(self):
        res = self.rdd.values().collect()
        assert res == [7, 2, 2]

    def test_sample(self):
        res = self.rdd.sample(False, 0.5, 7)
        assert res.count() == 1
        res = self.rdd.sample(False, 0.7, 7)
        assert res.count() == 2

    def test_groupBy(self):
        # Grouped values come back as ResultIterables, so convert them to
        # lists (and sort by key) before comparing.
        res = self.rdd.groupBy(lambda x: x[1]).mapValues(list)
        assert sorted(res.collect()) == [(2, [('a', 2), ('b', 2)]), (7, [('a', 7)])]

    def test_groupByKey(self):
        res = self.rdd.groupByKey().mapValues(list)
        assert sorted(res.collect()) == [('a', [7, 2]), ('b', [2])]

    def test_reduceByKey(self):
        # reduceByKey needs a two-argument combiner; the built-in sum would
        # be called as sum(7, 2) and raise a TypeError.
        res = self.rdd.reduceByKey(lambda x, y: x + y)
        assert sorted(res.collect()) == [('a', 9), ('b', 2)]

    def test_sortBy(self):
        res = self.rdd.sortBy(lambda x: x[1])
        assert res.collect() == [('a', 2), ('b', 2), ('a', 7)]

    def test_sortByKey(self):
        res = self.rdd.sortByKey()
        assert res.collect() == [('a', 7), ('a', 2), ('b', 2)]

    # -------
    # Actions
    # -------
    def test_getNumPartitions(self):
        assert self.rdd.getNumPartitions() == 10

    def test_collect(self):
        assert self.rdd.collect() == [('a', 7), ('a', 2), ('b', 2)]

    def test_count(self):
        assert self.rdd.count() == 3

    def test_countByValue(self):
        assert self.rdd.countByValue() == {
            ('a', 7): 1,
            ('a', 2): 1,
            ('b', 2): 1
        }

    def test_countByKey(self):
        assert self.rdd.countByKey() == {'a': 2, 'b': 1}

    def test_isEmpty(self):
        assert not self.rdd.isEmpty()

    def test_sum(self):
        assert self.rdd.values().sum() == 11

    def test_max(self):
        assert self.rdd.values().max() == 7

    def test_min(self):
        assert self.rdd.values().min() == 2

    def test_mean(self):
        assert self.rdd.values().mean() == pytest.approx(3.66, 0.01)

    def test_stdev(self):
        assert self.rdd.values().stdev() == pytest.approx(2.35, 0.01)

    def test_variance(self):
        assert self.rdd.values().variance() == pytest.approx(5.55, 0.01)

    def test_first(self):
        assert self.rdd.first() == ('a', 7)

    def test_take(self):
        assert self.rdd.take(2) == [('a', 7), ('a', 2)]

    def test_top(self):
        assert self.rdd.values().top(1) == [7]

    def test_foreach(self):
        # foreach runs on the executors and returns None, so there is
        # nothing to assert on the driver.
        pass

    def test_reduce(self):
        assert self.rdd.values().reduce(lambda x, y: x + y) == 11

    def test_saveAsTextFile(self):
        # saveAsTextFile writes a *directory* of part files (one per partition),
        # rendering each element with str(). Coalesce to one partition so a
        # single part file holds everything, then clean the directory up.
        dirname = 'output.txt'
        self.rdd.coalesce(1).saveAsTextFile(dirname)
        with open(os.path.join(dirname, 'part-00000')) as f:
            lines = f.read().splitlines()
        assert lines[0] == "('a', 7)"
        shutil.rmtree(dirname)
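# An alternative to calling SparkContext.getOrCreate() in setup_method is a
# session-scoped pytest fixture that owns the context's whole lifecycle; a
# conftest.py sketch (the fixture name and master setting are illustrative,
# and each test would take the fixture as an argument):
import pytest
from pyspark import SparkContext

@pytest.fixture(scope="session")
def spark_context():
    sc = SparkContext("local[2]", "rdd-tests")
    yield sc
    sc.stop()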
from math import sqrt
from pyspark import SparkConf, SparkContext

def computeCosineSimilarity(ratingPairs):
    # Accumulate the dot product and the squared norms of the two rating vectors.
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = numerator / float(denominator) if denominator else 0
    return (score, numPairs)

print("\nLoading movie names...")

# Use Spark's built-in cluster manager, treating every core of the laptop as a node.
conf = SparkConf().setMaster("local[*]")
# Build a SparkContext and create ratings: [userID, (movieID, rating)]
data = SparkContext(conf=conf).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination: [userID, ((movieID1, rating1), (movieID2, rating2))]
joinedRatings = ratings.join(ratings)

# Filter out duplicate pairs; filterDuplicates returns True or False
# (both helper functions are sketched after this script).
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2).
# Collect all ratings for each movie pair and compute similarity:
# [(movie1, movie2), ((rating1, rating2), (rating1, rating2), ...)]
moviePairRatings = moviePairs.groupByKey()
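# The script above references filterDuplicates and makePairs without defining
# them. A minimal sketch consistent with the shapes in the comments, where
# userRatings is one element of the self-joined RDD, i.e.
# (userID, ((movie1, rating1), (movie2, rating2))); in a real script these
# would have to appear before they are first referenced:
def filterDuplicates(userRatings):
    # The self-join yields both (m1, m2) and (m2, m1), plus each movie paired
    # with itself; keep only the ordered half.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return movie1 < movie2

def makePairs(userRatings):
    # Re-key from userID to the movie pair, keeping the two ratings as the value.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return ((movie1, movie2), (rating1, rating2))

# The grouped ratings can then be scored with the helper defined at the top:
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()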