Example #1
from pyspark import SparkContext
# Load the cleaned 2014 yellow-taxi CSV as an RDD of lines
distFile = SparkContext().textFile("Datacleaned/yellow_2014.csv")


def myMapFunc(line):
    # Bucket each CSV record by the value in column 13 and emit a (bucket_id, 1) pair
    fields = line.split(",")
    temp = float(fields[13])
    if temp < 15:
        return ('1', 1)
    if 15 <= temp < 40:
        return ('2', 1)
    if 40 <= temp < 70:
        return ('3', 1)
    if 70 <= temp < 100:
        return ('4', 1)
    if temp >= 100:
        return ('5', 1)


pairs = distFile.map(myMapFunc)
# counts = pairs.reduceByKey(lambda x, y: x + y)
pairs.saveAsTextFile('result.txt')
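# A minimal sketch (not part of the original example) of completing the hinted
# aggregation: sum the 1s per bucket key and inspect the per-range counts.
counts = pairs.reduceByKey(lambda x, y: x + y)
for bucket, count in counts.collect():
    print(bucket, count)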
Example #2
from pyspark import SparkContext
inputFile = '/home/zkpk/Desktop/Code/pythonproject/ml-100k/u.user'
sc = SparkContext('local', 'Statistics')
# Load the MovieLens u.user file; each line is: user id | age | gender | occupation | zip code
user_data = sc.textFile(inputFile)

user_fields = user_data.map(lambda line: line.split("|"))

num_users = user_fields.map(lambda fields: fields[0]).count()

num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()

num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()

num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
print "user number:  ", num_users
Example #3
import os

import pytest
from pyspark import SparkContext


class TestRDD(object):
    def setup_method(self):
        self.rdd = SparkContext().parallelize([('a', 7), ('a', 2), ('b', 2)])

    # ---------------
    # Transformations
    # ---------------
    def test_map(self):
        res = self.rdd.map(lambda x: x[0]).collect()
        assert res == ['a', 'a', 'b']

    def test_flatMap(self):
        res = self.rdd.flatMap(lambda x: x).collect()
        assert res == ['a', 7, 'a', 2, 'b', 2]

    def test_mapValues(self):
        res = self.rdd.groupByKey().mapValues(sum).collect()
        assert res == [('a', 9), ('b', 2)]

    def test_filter(self):
        res = self.rdd.filter(lambda x: x[0] == 'a').collect()
        assert res == [('a', 7), ('a', 2)]

    def test_keys(self):
        res = self.rdd.keys().collect()
        assert res == ['a', 'a', 'b']

    def test_values(self):
        res = self.rdd.values().collect()
        assert res == [7, 2, 2]

    def test_sample(self):
        res = self.rdd.sample(False, 0.5, 7)
        assert res.count() == 1

        res = self.rdd.sample(False, 0.7, 7)
        assert res.count() == 2

    def test_groupBy(self):
        res = self.rdd.groupBy(lambda x: x[1])
        assert res.collect() == [(7, [('a', 7)]), (2, [('a', 2), ('b', 2)])]

    def test_groupByKey(self):
        res = self.rdd.groupByKey()
        assert res.collect() == [('a', [7, 2]), ('b', [2])]

    def test_reduceByKey(self):
        res = self.rdd.reduceByKey(sum)
        assert res.collect() == [('a', 9), ('b', 2)]

    def test_sortBy(self):
        res = self.rdd.sortBy(lambda x: x[1])
        assert res.collect() == [('a', 2), ('b', 2), ('a', 7)]

    def test_sortByKey(self):
        res = self.rdd.sortByKey()
        assert res.collect() == [('a', 7), ('a', 2), ('b', 2)]

    # -------
    # Actions
    # -------
    def test_getNumPartitions(self):
        assert self.rdd.getNumPartitions() == 10

    def test_collect(self):
        assert self.rdd.collect() == [('a', 7), ('a', 2), ('b', 2)]

    def test_count(self):
        assert self.rdd.count() == 3

    def test_countByValue(self):
        assert self.rdd.countByValue() == {
            ('a', 7): 1,
            ('a', 2): 1,
            ('b', 2): 1
        }

    def test_countByKey(self):
        assert self.rdd.countByKey() == {'a': 2, 'b': 1}

    def test_isEmpty(self):
        assert not self.rdd.isEmpty()

    def test_sum(self):
        assert self.rdd.values().sum() == 11

    def test_max(self):
        assert self.rdd.values().max() == 7

    def test_min(self):
        assert self.rdd.values().min() == 2

    def test_mean(self):
        assert self.rdd.values().mean() == pytest.approx(3.66, 0.01)

    def test_stdev(self):
        assert self.rdd.values().stdev() == pytest.approx(2.35, 0.01)

    def test_variance(self):
        assert self.rdd.values().variance() == pytest.approx(5.55, 0.01)

    def test_first(self):
        assert self.rdd.first() == ('a', 7)

    def test_take(self):
        assert self.rdd.take(2) == [('a', 7), ('a', 2)]

    def test_top(self):
        assert self.rdd.values().top(1) == [7]

    def test_foreach(self):
        # Very difficult to test foreach because it returns None
        pass
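
    def test_foreach_side_effect(self):
        # Sketch, not part of the original example: foreach can still be
        # exercised through a side effect such as a Spark accumulator.
        acc = self.rdd.context.accumulator(0)
        self.rdd.foreach(lambda kv: acc.add(kv[1]))
        assert acc.value == 11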

    def test_reduce(self):
        assert self.rdd.values().reduce(lambda x, y: x + y) == 11

    def test_saveAsTextFile(self):
        filename = 'output.txt'
        self.rdd.saveAsTextFile(filename)
        with open(filename, 'r') as f:
            lines = f.read().splitlines()
            assert lines[0] == 'a,7'
        os.remove(filename)
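
# Note: instantiating a new SparkContext in every setup_method fails against a
# real Spark installation, since only one SparkContext may be active per process.
# One common workaround (a sketch, not from the original example) is a shared,
# session-scoped pytest fixture:
@pytest.fixture(scope="session")
def spark_context():
    sc = SparkContext("local[2]", "rdd-tests")
    yield sc
    sc.stop()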
Example #4

from math import sqrt

from pyspark import SparkConf, SparkContext


# Cosine-similarity helper; the original snippet begins mid-function, so the
# function name and the running-sum initialization are reconstructed here.
def computeCosineSimilarity(ratingPairs):
    numPairs = 0
    sum_xx = sum_yy = sum_xy = 0
    for ratingX, ratingY in ratingPairs:
        sum_xx += ratingX * ratingX
        sum_yy += ratingY * ratingY
        sum_xy += ratingX * ratingY
        numPairs += 1
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)
    score = (numerator / float(denominator)) if denominator else 0
    return (score, numPairs)
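
# filterDuplicates and makePairs are used further down but are not defined in
# the original snippet; the functions below are plausible sketches, not the
# original author's implementations.
def filterDuplicates(userRatings):
    # Record shape after the self-join: (userID, ((movie1, rating1), (movie2, rating2))).
    # Keep each unordered movie pair exactly once by requiring movie1 < movie2.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return movie1 < movie2


def makePairs(userRatings):
    # Re-key the record by the movie pair, keeping the two ratings as the value.
    (movie1, rating1) = userRatings[1][0]
    (movie2, rating2) = userRatings[1][1]
    return ((movie1, movie2), (rating1, rating2))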

# Use Spark's built-in cluster manager to treat every core of this laptop as a node
print("\nLoading movie names...")

# Build a SparkContext and create ratings: (userID, (movieID, rating))
data = SparkContext(conf=SparkConf()).textFile("source/ratings.dat")
ratings = data.map(lambda l: l.split()).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))

# Emit every movie rated together by the same user.
# Self-join to find every combination.
joinedRatings = ratings.join(ratings)  # (userID, ((movieID1, rating1), (movieID2, rating2)))

# Filter out duplicate pairs; filterDuplicates returns True or False for each record.
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

# Now key by movie pairs: [(movie1, movie2), (rating1, rating2)]
moviePairs = uniqueJoinedRatings.map(makePairs)

# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()  #[(movie1, movie2), ((rating1, rating2), (rating1, rating2) ...)]
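
# The similarity computation itself is not part of this snippet; a minimal
# sketch of the remaining step (an assumption, reusing the helper defined above):
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()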