from pyspark import SparkContext

sc = SparkContext()
distFile = sc.textFile("Datacleaned/yellow_2014.csv")

def myMapFunc(line):
    # Bucket each record by the numeric value in field 13 and emit (bucket_id, 1).
    fields = line.split(",")
    temp = float(fields[13])
    if temp < 15:
        return ('1', 1)
    elif temp < 40:
        return ('2', 1)
    elif temp < 70:
        return ('3', 1)
    elif temp < 100:
        return ('4', 1)
    else:
        return ('5', 1)

pairs = distFile.map(myMapFunc)
# reduceByKey takes a two-argument combiner; the original single-argument
# lambda would have failed, so the per-bucket counts were never computed.
counts = pairs.reduceByKey(lambda x, y: x + y)
# Note: saveAsTextFile creates a directory named result.txt containing part files.
counts.saveAsTextFile('result.txt')
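# A quick way to sanity-check the result without reading the saved part files
# is to collect the counts on the driver -- a sketch, safe here only because
# the reduced RDD holds at most five (bucket, count) pairs:
for bucket, count in sorted(counts.collect()):
    print(bucket, count)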
from pyspark import SparkContext

inputFile = '/home/zkpk/Desktop/Code/pythonproject/ml-100k/u.user'
sc = SparkContext('local', 'Statistics')
# The original called map() on the SparkContext itself; the file has to be
# loaded into an RDD first.
user_data = sc.textFile(inputFile)

# u.user format: user id | age | gender | occupation | zip code
user_fields = user_data.map(lambda line: line.split("|"))
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
print("user number:", num_users)
print("gender number:", num_genders)
print("occupation number:", num_occupations)
print("zipcode number:", num_zipcodes)
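# Each count()/distinct().count() above launches a separate job that re-reads
# and re-splits the input file. Since user_fields is reused four times, caching
# the parsed RDD avoids the repeated scans -- a one-line change (sketch):
user_fields = user_data.map(lambda line: line.split("|")).cache()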
import os
import shutil

import pytest
from pyspark import SparkContext


class TestRDD(object):
    def setup_method(self):
        # getOrCreate() reuses a single SparkContext across tests; constructing
        # a fresh one per test would fail, since only one may be active at a time.
        sc = SparkContext.getOrCreate()
        # Explicitly ask for 10 slices so test_getNumPartitions sees 10 partitions.
        self.rdd = sc.parallelize([('a', 7), ('a', 2), ('b', 2)], 10)

    # ---------------
    # Transformations
    # ---------------
    def test_map(self):
        res = self.rdd.map(lambda x: x[0]).collect()
        assert res == ['a', 'a', 'b']

    def test_flatMap(self):
        res = self.rdd.flatMap(lambda x: x).collect()
        assert res == ['a', 7, 'a', 2, 'b', 2]

    def test_mapValues(self):
        res = self.rdd.groupByKey().mapValues(sum).collect()
        assert sorted(res) == [('a', 9), ('b', 2)]

    def test_filter(self):
        res = self.rdd.filter(lambda x: x[0] == 'a').collect()
        assert res == [('a', 7), ('a', 2)]

    def test_keys(self):
        res = self.rdd.keys().collect()
        assert res == ['a', 'a', 'b']

    def test_values(self):
        res = self.rdd.values().collect()
        assert res == [7, 2, 2]

    def test_sample(self):
        res = self.rdd.sample(False, 0.5, 7)
        assert res.count() == 1
        res = self.rdd.sample(False, 0.7, 7)
        assert res.count() == 2

    def test_groupBy(self):
        # Grouped values come back as ResultIterables, so convert them to
        # lists (and sort by key) before comparing.
        res = self.rdd.groupBy(lambda x: x[1]).mapValues(list)
        assert sorted(res.collect()) == [(2, [('a', 2), ('b', 2)]), (7, [('a', 7)])]

    def test_groupByKey(self):
        res = self.rdd.groupByKey().mapValues(list)
        assert sorted(res.collect()) == [('a', [7, 2]), ('b', [2])]

    def test_reduceByKey(self):
        # reduceByKey needs a two-argument combiner; the built-in sum would
        # be called as sum(7, 2) and raise a TypeError.
        res = self.rdd.reduceByKey(lambda x, y: x + y)
        assert sorted(res.collect()) == [('a', 9), ('b', 2)]

    def test_sortBy(self):
        res = self.rdd.sortBy(lambda x: x[1])
        assert res.collect() == [('a', 2), ('b', 2), ('a', 7)]

    def test_sortByKey(self):
        res = self.rdd.sortByKey()
        assert res.collect() == [('a', 7), ('a', 2), ('b', 2)]

    # -------
    # Actions
    # -------
    def test_getNumPartitions(self):
        assert self.rdd.getNumPartitions() == 10

    def test_collect(self):
        assert self.rdd.collect() == [('a', 7), ('a', 2), ('b', 2)]

    def test_count(self):
        assert self.rdd.count() == 3

    def test_countByValue(self):
        assert self.rdd.countByValue() == {
            ('a', 7): 1,
            ('a', 2): 1,
            ('b', 2): 1
        }

    def test_countByKey(self):
        assert self.rdd.countByKey() == {'a': 2, 'b': 1}

    def test_isEmpty(self):
        assert not self.rdd.isEmpty()

    def test_sum(self):
        assert self.rdd.values().sum() == 11

    def test_max(self):
        assert self.rdd.values().max() == 7

    def test_min(self):
        assert self.rdd.values().min() == 2

    def test_mean(self):
        assert self.rdd.values().mean() == pytest.approx(3.66, 0.01)

    def test_stdev(self):
        assert self.rdd.values().stdev() == pytest.approx(2.35, 0.01)

    def test_variance(self):
        assert self.rdd.values().variance() == pytest.approx(5.55, 0.01)

    def test_first(self):
        assert self.rdd.first() == ('a', 7)

    def test_take(self):
        assert self.rdd.take(2) == [('a', 7), ('a', 2)]

    def test_top(self):
        assert self.rdd.values().top(1) == [7]

    def test_foreach(self):
        # foreach runs on the executors and returns None, so there is
        # nothing to assert on the driver.
        pass

    def test_reduce(self):
        assert self.rdd.values().reduce(lambda x, y: x + y) == 11

    def test_saveAsTextFile(self):
        # saveAsTextFile writes a *directory* of part files (one per partition),
        # rendering each element with str(). Coalesce to one partition so a
        # single part file holds everything, then clean the directory up.
        dirname = 'output.txt'
        self.rdd.coalesce(1).saveAsTextFile(dirname)
        with open(os.path.join(dirname, 'part-00000')) as f:
            lines = f.read().splitlines()
        assert lines[0] == "('a', 7)"
        shutil.rmtree(dirname)
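# An alternative to calling SparkContext.getOrCreate() in setup_method is a
# session-scoped pytest fixture that owns the context's whole lifecycle; a
# conftest.py sketch (the fixture name and master setting are illustrative,
# and each test would take the fixture as an argument):
import pytest
from pyspark import SparkContext

@pytest.fixture(scope="session")
def spark_context():
    sc = SparkContext("local[2]", "rdd-tests")
    yield sc
    sc.stop()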
from math import sqrt
from pyspark import SparkConf, SparkContext

def computeCosineSimilarity(ratingPairs):
    # Accumulate the dot product and the squared norms of the two rating vectors.
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = numerator / float(denominator) if denominator else 0
    return (score, numPairs)

print("\nLoading movie names...")

# Use Spark's built-in cluster manager, treating every core of the laptop as a node.
conf = SparkConf().setMaster("local[*]")
# Build a SparkContext and create ratings: [userID, (movieID, rating)]
data = SparkContext(conf=conf).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination: [userID, ((movieID1, rating1), (movieID2, rating2))]
joinedRatings = ratings.join(ratings)

# Filter out duplicate pairs; filterDuplicates returns True or False
# (both helper functions are sketched after this script).
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2).
# Collect all ratings for each movie pair and compute similarity:
# [(movie1, movie2), ((rating1, rating2), (rating1, rating2), ...)]
moviePairRatings = moviePairs.groupByKey()
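# The script above references filterDuplicates and makePairs without defining
# them. A minimal sketch consistent with the shapes in the comments, where
# userRatings is one element of the self-joined RDD, i.e.
# (userID, ((movie1, rating1), (movie2, rating2))); in a real script these
# would have to appear before they are first referenced:
def filterDuplicates(userRatings):
    # The self-join yields both (m1, m2) and (m2, m1), plus each movie paired
    # with itself; keep only the ordered half.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return movie1 < movie2

def makePairs(userRatings):
    # Re-key from userID to the movie pair, keeping the two ratings as the value.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return ((movie1, movie2), (rating1, rating2))

# The grouped ratings can then be scored with the helper defined at the top:
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()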