def parseReviewToTrainingSet(reviewRDD, testReviewRDD):
    """Build the (training, test) rating RDDs for ALS from raw review RDDs.

    Filters both RDDs down to restaurant reviews, assigns dense integer ids
    to users and restaurants, normalizes stars by each user's average, and
    maps the test-set ids into the same integer id space.

    Args:
        reviewRDD: RDD of raw training review records (one JSON line each).
        testReviewRDD: RDD of raw test review records.

    Returns:
        Tuple ``(userReviewRestaNormRDD, testReviewRestRDD)``: the
        normalized training triples and the filtered/normalized test triples.

    NOTE(review): relies on module-level names ``sc``, ``restaurantListBC``,
    ``assignNum``, ``parseUserBusiLst`` and ``Review`` defined elsewhere in
    this file — confirm they are in scope before calling.
    """
    reviewRestaRDD = reviewRDD.map(Review.toString) \
                              .filter(lambda line: Review.is_res(line, restaurantListBC))

    # Collect the training-set user and restaurant ids.
    userList = reviewRestaRDD.map(Review.getuserid).sortByKey().collect()
    restList = reviewRestaRDD.map(Review.getbusiid).sortByKey().collect()

    # Broadcast membership as *sets*: the original broadcast the raw lists,
    # making the `in` test in the filter below O(n) per test record.
    userSetBC = sc.broadcast(set(userList))
    restSetBC = sc.broadcast(set(restList))

    # Dictionaries mapping ids <-> dense integer indices for the training set.
    userIdToNumDict, userNumToIdDict = assignNum(userList)
    userIdToNumDictBC = sc.broadcast(userIdToNumDict)
    restIdToNumDict, restNumToIdDict = assignNum(restList)
    restIdToNumDictBC = sc.broadcast(restIdToNumDict)

    # Aggregate each user's reviews, then subtract the user's average star.
    userReviewRestaRDD = reviewRestaRDD.map(Review.mapper) \
                                       .reduceByKey(Review.reducer) \
                                       .map(Review.reshape)
    userReviewRestaCollNormRDD = userReviewRestaRDD.map(Review.normalize)
    userReviewRestaNormLst = userReviewRestaCollNormRDD.collect()

    # user id -> average star; needed to normalize the test set identically.
    userAvgDict = dict(userReviewRestaCollNormRDD.map(lambda x: (x[0], x[1])).collect())

    userReviewRestaLst = parseUserBusiLst(userReviewRestaNormLst,
                                          userIdToNumDict, restIdToNumDict)
    userReviewRestaNormRDD = sc.parallelize(userReviewRestaLst)

    # Keep only test reviews whose user AND restaurant occur in the training
    # set, then normalize stars and convert string ids to integer indices.
    testReviewRestRDD = testReviewRDD.map(Review.toString).map(Review.getUsrResStar)
    testReviewRestRDD = testReviewRestRDD \
        .filter(lambda x: x[0] in userSetBC.value and x[1] in restSetBC.value) \
        .map(lambda x: Review.normalizeStar(userAvgDict, x)) \
        .map(lambda x: Review.replaceIDwithNum(x, userIdToNumDictBC, restIdToNumDictBC))

    return userReviewRestaNormRDD, testReviewRestRDD
import itertools
import math
import os
import shutil

from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    # Spark driver configuration for the Yelp review ALS pipeline.
    conf = SparkConf() \
        .setAppName("YelpReviewALS") \
        .set("spark.executor.memory", "2g") \
        .set("spark.python.worker.memory", "2g")
    sc = SparkContext('local', conf=conf)

    businessRDD = sc.textFile("../../../data/yelp_academic_dataset_business.json")
    sc.setCheckpointDir("checkpoints/")
    restaurantRDD = businessRDD.map(Business.to_string).filter(Business.is_res)
    for item in businessRDD.take(10):
        print(item)

    restaurantList = restaurantRDD.map(Business.get_id).collect()
    # CONSISTENCY FIX: later code in this script refers to the broadcast as
    # `restaurantListBC`, but the original bound it only to `restBC`
    # (NameError at runtime). Bind the canonical name and keep the old alias.
    restaurantListBC = sc.broadcast(restaurantList)
    restBC = restaurantListBC

    # Remove stale output; shutil.rmtree replaces the original shelling out
    # to `rm -rf` (portable, no subprocess).
    if os.path.exists("RestaurantReviews"):
        shutil.rmtree("RestaurantReviews")

    # BUG FIX: saveAsTextFile() is an action that returns None, so the
    # original `reviewRDD = ...saveAsTextFile(...)` left reviewRDD bound to
    # None. Keep the transformed RDD, then save it as a separate step.
    reviewRDD = sc.textFile("../../../data/yelp_academic_dataset_review_large.json") \
        .filter(lambda x: Review.is_res(x, restaurantListBC)) \
        .map(Review.parseRatings)
    reviewRDD.saveAsTextFile("RestaurantReviews")
similarityEstimate = mat.columnSimilarities().entries if os.path.exists("../../../data/similarities"): os.removedirs("../../../data/similarities") similarityEstimate.saveAsTextFile("../../../data/similarities") #model = PowerIterationClustering.train(similarityEstimate, 2, 10) ''' restaurantSample = restTupleListRDD.take(20) for i in xrange(20): print(restaurantSample[i]) ''' # collect all possible values of every attribute, using accumulator # unfortunately not working.... ''' attributeAccum = sc.accumulator(restAttribSetBC, attributeAccumulatorParam()) restaurantRDD.map(lambda entry: (entry, restAttribSetBC)).foreach(lambda x: attributeAccum.add(x)) print(type(attributeAccum)) ''' reviewRDD = sc.textFile("../../../data/yelp_academic_dataset_review.json") reviewRestaRDD = reviewRDD.map(Review.toString).filter(lambda line: Review.is_res(line, restaurantListBC)) userReviewRestaRDD = reviewRestaRDD.map(Review.mapper).reduceByKey(Review.reducer) userRDD = sc.textFile("../../../data/yelp_academic_dataset_user.json") userRDD = userRDD.map(User.toString).map(User.getFriends) # user friendlist