def parseReviewToTrainingSet(reviewRDD, testReviewRDD):
    # Keep only reviews of restaurants (restaurantListBC is broadcast by the caller).
    reviewRestaRDD = reviewRDD.map(Review.toString) \
                              .filter(lambda line: Review.is_res(line, restaurantListBC))

    # Distinct user and restaurant IDs seen in the training set. Broadcast them
    # as sets so the membership tests on the test data below are O(1) lookups.
    # (The original called sortByKey() here, but these RDDs hold plain IDs, not
    # key-value pairs, so distinct() is what is actually needed.)
    userList = reviewRestaRDD.map(Review.getuserid).distinct().collect()
    restList = reviewRestaRDD.map(Review.getbusiid).distinct().collect()
    userListBC = sc.broadcast(set(userList))
    restListBC = sc.broadcast(set(restList))

    # Dictionaries mapping user IDs <-> integer indices; assignNum is sketched
    # after this function.
    userIdToNumDict, userNumToIdDict = assignNum(userList)
    userIdToNumDictBC = sc.broadcast(userIdToNumDict)
    userNumToIdDictBC = sc.broadcast(userNumToIdDict)

    # Dictionaries mapping restaurant IDs <-> integer indices.
    restIdToNumDict, restNumToIdDict = assignNum(restList)
    restIdToNumDictBC = sc.broadcast(restIdToNumDict)
    restNumToIdDictBC = sc.broadcast(restNumToIdDict)

    # Group each user's ratings together, then subtract the per-user average.
    userReviewRestaRDD = reviewRestaRDD.map(Review.mapper) \
                                       .reduceByKey(Review.reducer) \
                                       .map(Review.reshape)
    userReviewRestaCollNormRDD = userReviewRestaRDD.map(Review.normalize)
    userReviewRestaNormLst = userReviewRestaCollNormRDD.collect()

    # Per-user average ratings, needed to normalize the test set the same way.
    userAvgDict = dict(userReviewRestaCollNormRDD.map(lambda x: (x[0], x[1])).collect())

    # Replace string IDs with integer indices and re-parallelize the training set.
    userReviewRestaLst = parseUserBusiLst(userReviewRestaNormLst, userIdToNumDict, restIdToNumDict)
    userReviewRestaNormRDD = sc.parallelize(userReviewRestaLst)
    print(userReviewRestaNormRDD.take(10))  # debug

    # Build the test set: keep only reviews whose user and restaurant both
    # appear in training, normalize the star rating by the user's average,
    # and swap string IDs for integer indices.
    testReviewRestRDD = testReviewRDD.map(Review.toString).map(Review.getUsrResStar)
    testReviewRestRDD = testReviewRestRDD \
        .filter(lambda x: x[0] in userListBC.value and x[1] in restListBC.value) \
        .map(lambda x: Review.normalizeStar(userAvgDict, x)) \
        .map(lambda x: Review.replaceIDwithNum(x, userIdToNumDictBC, restIdToNumDictBC))
    print(testReviewRestRDD.take(10))  # debug

    return userReviewRestaNormRDD, testReviewRestRDD
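# assignNum is called above and in the driver below but defined elsewhere in
# the repo. A minimal sketch of the behavior its call sites imply -- two
# dictionaries mapping each distinct ID to a contiguous integer index and
# back; the project's actual definition may differ.
def assignNum(idList):
    idToNum = {}
    numToId = {}
    for num, ident in enumerate(sorted(set(idList))):
        idToNum[ident] = num
        numToId[num] = ident
    return idToNum, numToId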
import os
import shutil
import itertools
import math

from pyspark import SparkConf, SparkContext
# Business, Review, User, assignNum and parseUserBusiLst are project helpers
# defined elsewhere in the repo; import them from wherever your layout keeps them.

if __name__ == "__main__":
    conf = SparkConf() \
        .setAppName("YelpReviewALS") \
        .set("spark.executor.memory", "2g") \
        .set("spark.python.worker.memory", "2g")
    sc = SparkContext('local', conf=conf)

    # Load the business records and keep only restaurants.
    businessRDD = sc.textFile("../../../data/yelp_academic_dataset_business.json")
    sc.setCheckpointDir("checkpoints/")
    restaurantRDD = businessRDD.map(Business.to_string).filter(Business.is_res)
    for item in businessRDD.take(10):  # debug
        print(item)

    restaurantList = restaurantRDD.map(Business.get_id).collect()
    restBC = sc.broadcast(restaurantList)

    # Pre-filter the large review file down to restaurant reviews and persist
    # the parsed ratings for later stages. Note that saveAsTextFile() returns
    # None, so it must not be chained into the reviewRDD assignment as the
    # original did.
    if os.path.exists("RestaurantReviews"):
        shutil.rmtree("RestaurantReviews")
    reviewRDD = sc.textFile("../../../data/yelp_academic_dataset_review_large.json") \
        .filter(lambda x: Review.is_res(x, restBC)) \
        .map(Review.parseRatings)
    reviewRDD.saveAsTextFile("RestaurantReviews")
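# The Business helpers used above live elsewhere in the repo. A rough sketch
# of the shape they would need, assuming the classic Yelp academic dataset
# layout (one JSON object per line, with "business_id" and a "categories"
# list); the project's real implementation may differ.
import json

class BusinessSketch(object):
    @staticmethod
    def to_string(line):
        # Placeholder for whatever encoding normalization the project does.
        return line

    @staticmethod
    def is_res(line):
        # A business counts as a restaurant if "Restaurants" is among its categories.
        return "Restaurants" in (json.loads(line).get("categories") or [])

    @staticmethod
    def get_id(line):
        return json.loads(line)["business_id"]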
    # Basic statistics of the parsed ratings (ratings, numRatings and numUsers
    # are computed earlier, outside this excerpt).
    numRestaurants = ratings.values().map(lambda r: r[1]).distinct().count()
    print("Got %d ratings from %d users on %d restaurants." % (numRatings, numUsers, numRestaurants))

    # Index users and restaurants with contiguous integers and broadcast the
    # lookup tables.
    userList = ratings.values().map(lambda r: r[0]).distinct().collect()
    restList = ratings.values().map(lambda r: r[1]).distinct().collect()
    getUserIndex, getUserID = assignNum(userList)
    getRestIndex, getRestID = assignNum(restList)
    getUserIDBC = sc.broadcast(getUserID)
    getUserIndexBC = sc.broadcast(getUserIndex)
    getRestIndexBC = sc.broadcast(getRestIndex)
    getRestIDBC = sc.broadcast(getRestID)

    # Tuple-unpacking lambdas (lambda (x, y): ...) are Python 2 only; index
    # into the pair instead so this also runs on Python 3.
    ratings = ratings.map(lambda kv: (kv[0], Review.replaceIDwithNum(kv[1], getUserIndexBC, getRestIndexBC)))

    # Per-user average rating, restricted to users with at least 3 ratings.
    usrRatingAvg = ratings.values().map(lambda x: (x[0], x[2])) \
        .reduceByKey(Review.reducer).map(Review.reshape) \
        .filter(lambda x: len(x[1]) >= 3).map(Review.reshapeList) \
        .map(lambda x: (x[0], sum(x[1]) / float(len(x[1]))))
    usrRatingAvgBC = sc.broadcast(dict(usrRatingAvg.collect()))

    # Mean-center each rating by its user's average (subtractAvg is sketched below).
    ratings = ratings.filter(lambda kv: kv[1][0] in usrRatingAvgBC.value) \
        .map(lambda kv: (kv[0], Review.subtractAvg(kv[1], usrRatingAvgBC)))

    # Each rating carries an integer key (presumably assigned in
    # Review.parseRatings); keys <= 6 form the training split.
    numOfPartitions = 4
    trainingVal = ratings.filter(lambda x: x[0] <= 6).values()
    trainingMean = trainingVal.map(lambda x: x[2]).mean()
    training = trainingVal.repartition(numOfPartitions).cache()
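    # Review.subtractAvg is defined elsewhere; its call site above implies it
    # takes a (user_index, restaurant_index, stars) tuple plus the broadcast
    # dict of per-user averages. A sketch under that assumption (the real
    # signature may differ):
    def subtract_avg(rating, usrRatingAvgBC):
        user, rest, stars = rating
        # Mean-center the rating so the model fits deviations from each user's baseline.
        return (user, rest, stars - usrRatingAvgBC.value[user])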
    # Column-wise cosine similarities between restaurants; mat is built earlier,
    # outside this excerpt (an illustrative construction is sketched at the end
    # of this file).
    similarityEstimate = mat.columnSimilarities().entries
    # os.removedirs() only deletes empty directories, so clearing a previous
    # non-empty output would fail; use shutil.rmtree instead.
    if os.path.exists("../../../data/similarities"):
        shutil.rmtree("../../../data/similarities")
    similarityEstimate.saveAsTextFile("../../../data/similarities")
    # model = PowerIterationClustering.train(similarityEstimate, 2, 10)

    '''
    restaurantSample = restTupleListRDD.take(20)
    for i in range(20):
        print(restaurantSample[i])
    '''

    # Collect all possible values of every attribute using an accumulator --
    # unfortunately not working:
    '''
    attributeAccum = sc.accumulator(restAttribSetBC, attributeAccumulatorParam())
    restaurantRDD.map(lambda entry: (entry, restAttribSetBC)).foreach(lambda x: attributeAccum.add(x))
    print(type(attributeAccum))
    '''

    # Restaurant reviews, grouped per user.
    reviewRDD = sc.textFile("../../../data/yelp_academic_dataset_review.json")
    reviewRestaRDD = reviewRDD.map(Review.toString) \
                              .filter(lambda line: Review.is_res(line, restaurantListBC))
    userReviewRestaRDD = reviewRestaRDD.map(Review.mapper).reduceByKey(Review.reducer)

    # Each user's friend list.
    userRDD = sc.textFile("../../../data/yelp_academic_dataset_user.json")
    userRDD = userRDD.map(User.toString).map(User.getFriends)
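# A plausible construction of `mat`, given that columnSimilarities() is a
# method of mllib's RowMatrix: rows are users, columns are restaurants, and
# entries are the (mean-centered) star ratings. The names ratings_rdd and
# n_restaurants are illustrative, not from the original code.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

def build_rating_matrix(ratings_rdd, n_restaurants):
    # ratings_rdd holds (user_index, restaurant_index, stars) triples; gather
    # each user's ratings into one sparse row indexed by restaurant.
    rows = ratings_rdd.map(lambda x: (x[0], [(x[1], x[2])])) \
                      .reduceByKey(lambda a, b: a + b) \
                      .map(lambda kv: Vectors.sparse(n_restaurants, sorted(kv[1])))
    return RowMatrix(rows)

# Restaurant-restaurant cosine similarities would then be:
#   build_rating_matrix(training, numRestaurants).columnSimilarities().entries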