Example #1
def parseReviewToTrainingSet(reviewRDD, testReviewRDD):
    # Keep only reviews that refer to restaurants (restaurantListBC is broadcast by the caller)
    reviewRestaRDD = reviewRDD.map(Review.toString).filter(lambda line: Review.is_res(line, restaurantListBC))
    userList = reviewRestaRDD.map(Review.getuserid).distinct().collect()
    restList = reviewRestaRDD.map(Review.getbusiid).distinct().collect()
    # Broadcast as sets so the per-record membership tests below are O(1)
    userListBC = sc.broadcast(set(userList))
    restListBC = sc.broadcast(set(restList))

    print(userList[10])  # debug: spot-check one user id

    '''
    Generate dictionaries mapping users in the training set to indices, and broadcast them
    '''
    userIdToNumDict, userNumToIdDict = assignNum(userList)

    userIdToNumDictBC = sc.broadcast(userIdToNumDict)
    userNumToIdDictBC = sc.broadcast(userNumToIdDict)

    '''
    Generate dictionaries mapping restaurants in the training set to indices, and broadcast them
    '''
    restIdToNumDict, restNumToIdDict = assignNum(restList)

    restIdToNumDictBC = sc.broadcast(restIdToNumDict)
    restNumToIdDictBC = sc.broadcast(restNumToIdDict)


    # Group each user's restaurant reviews together, then subtract the user's
    # average rating from each of them
    userReviewRestaRDD = reviewRestaRDD.map(Review.mapper).reduceByKey(Review.reducer).map(Review.reshape)
    userReviewRestaCollNormRDD = userReviewRestaRDD.map(Review.normalize)
    userReviewRestaNormLst = userReviewRestaCollNormRDD.collect()  # alternative: map(Review.flatten).flatMap(Review.vectorize)
    userAvgDict = dict(userReviewRestaCollNormRDD.map(lambda x: (x[0], x[1])).collect())  # userId -> that user's average rating

    userReviewRestaLst = parseUserBusiLst(userReviewRestaNormLst, userIdToNumDict, restIdToNumDict)

    userReviewRestaNormRDD = sc.parallelize(userReviewRestaLst)
    #usrResStarTupleRDD = reviewRestaRDD.map(Review.getUsrResStar)


    testReviewRestRDD = testReviewRDD.map(Review.toString).map(Review.getUsrResStar)  # (userId, businessId, stars) triples

    print(userReviewRestaNormRDD.take(10))
    # Keep only test reviews whose user and restaurant appear in the training set,
    # subtract the user's average rating, and replace string ids with dense indices
    testReviewRestRDD = testReviewRestRDD.filter(lambda x: x[0] in userListBC.value and x[1] in restListBC.value)\
        .map(lambda x: Review.normalizeStar(userAvgDict, x))\
        .map(lambda x: Review.replaceIDwithNum(x, userIdToNumDictBC, restIdToNumDictBC))
        #.filter(lambda x: x[0] != 0 and x[1] != 0)
    print(testReviewRestRDD.take(10))
    return userReviewRestaNormRDD, testReviewRestRDD
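
The assignNum helper used above is not part of this listing. From how it is called here and in Example #3, it appears to return a pair of dictionaries mapping ids to dense integer indices and back; a minimal sketch under that assumption (the enumeration order is arbitrary):

def assignNum(idList):
    # Sketch only: build id -> index and index -> id lookup tables
    idToNum = {}
    numToId = {}
    for num, anId in enumerate(idList):
        idToNum[anId] = num
        numToId[num] = anId
    return idToNum, numToId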
Example #2
import os
import shutil

from pyspark import SparkConf, SparkContext
# The Business and Review JSON-parsing helpers are assumed to be importable from elsewhere in the project

if __name__ == "__main__":
    conf = SparkConf() \
        .setMaster("local") \
        .setAppName("YelpReviewALS") \
        .set("spark.executor.memory", "2g")\
        .set("spark.python.worker.memory", "2g")
    sc = SparkContext(conf=conf)

    businessRDD = sc.textFile("../../../data/yelp_academic_dataset_business.json")
    sc.setCheckpointDir("checkpoints/")
    restaurantRDD = businessRDD.map(Business.to_string).filter(Business.is_res)
    for item in businessRDD.take(10):  # debug: inspect a few raw business records
        print(item)
    restaurantList = restaurantRDD.map(Business.get_id).collect()

    restBC = sc.broadcast(restaurantList)

    # Remove stale output first; saveAsTextFile fails if the directory already exists
    if os.path.exists("RestaurantReviews"):
        shutil.rmtree("RestaurantReviews")

    # saveAsTextFile is an action that returns None, so there is nothing useful to assign here
    sc.textFile("../../../data/yelp_academic_dataset_review_large.json")\
        .filter(lambda x: Review.is_res(x, restBC))\
        .map(Review.parseRatings)\
        .saveAsTextFile("RestaurantReviews")
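
The Business helper is not defined in these excerpts. A plausible sketch, assuming the older Yelp academic dataset layout in which each line is a JSON object with a business_id field and a categories list (both field names and the return shapes are assumptions):

import json

class Business(object):
    # Sketch only: the real helper class is not shown in these examples

    @staticmethod
    def to_string(line):
        # each input line is one JSON business record
        return json.loads(line)

    @staticmethod
    def is_res(record):
        # treat a business as a restaurant if "Restaurants" appears among its categories
        return "Restaurants" in (record.get("categories") or [])

    @staticmethod
    def get_id(record):
        return record["business_id"]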
Example #3
    numRestaurants = ratings.values().map(lambda r: r[1]).distinct().count()

    print("Got %d ratings from %d users on %d restaurants." % (numRatings, numUsers, numRestaurants))

    userList = ratings.values().map(lambda r: r[0]).distinct().collect()
    restList = ratings.values().map(lambda r: r[1]).distinct().collect()

    getUserIndex, getUserID = assignNum(userList)
    getRestIndex, getRestID = assignNum(restList)

    getUserIDBC = sc.broadcast(getUserID)
    getUserIndexBC = sc.broadcast(getUserIndex)
    getRestIndexBC = sc.broadcast(getRestIndex)
    getRestIDBC = sc.broadcast(getRestID)

    # Python 3 removed tuple-unpacking lambdas, so index into the (key, value) pair instead
    ratings = ratings.map(lambda kv: (kv[0], Review.replaceIDwithNum(kv[1], getUserIndexBC, getRestIndexBC)))
    # Keep users with at least 3 ratings and compute each user's average rating
    usrRatingAvg = ratings.values().map(lambda x: (x[0], x[2])).reduceByKey(Review.reducer).map(Review.reshape)\
                    .filter(lambda x: len(x[1]) >= 3).map(Review.reshapeList)\
                    .map(lambda x: (x[0], sum(x[1])/float(len(x[1]))))
    usrRatingAvgBC = sc.broadcast(dict(usrRatingAvg.collect()))
    # Center each remaining rating by subtracting its user's average
    ratings = ratings.filter(lambda x: x[1][0] in usrRatingAvgBC.value).map(lambda kv: (kv[0], Review.subtractAvg(kv[1], usrRatingAvgBC)))

    numOfPartitions = 4

    trainingVal = ratings.filter(lambda x: x[0] <= 6) \
        .values()  # records keyed 0-6 form the training split

    trainingMean = trainingVal.map(lambda x: x[2]).mean()

    training = trainingVal.repartition(numOfPartitions).cache()
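
The Review.reducer and Review.reshape helpers are not included in this listing; for reference, the per-user average computed above can also be written with plain PySpark aggregation. A sketch, not the original code:

# Per-user average rating via aggregateByKey, equivalent in spirit to the
# reducer/reshape chain above
pairs = ratings.values().map(lambda r: (r[0], r[2]))      # (userNum, stars)
sumCount = pairs.aggregateByKey(
    (0.0, 0),                                             # (running sum, count)
    lambda acc, star: (acc[0] + star, acc[1] + 1),
    lambda a, b: (a[0] + b[0], a[1] + b[1]))
userAvg = sumCount.filter(lambda kv: kv[1][1] >= 3) \
                  .mapValues(lambda s: s[0] / s[1])       # keep users with >= 3 ratings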
Example #4
    # Cosine similarities between restaurant columns; .entries is an RDD of MatrixEntry(i, j, value)
    similarityEstimate = mat.columnSimilarities().entries
    if os.path.exists("../../../data/similarities"):
        shutil.rmtree("../../../data/similarities")  # os.removedirs only deletes empty directories
    similarityEstimate.saveAsTextFile("../../../data/similarities")
    #model = PowerIterationClustering.train(similarityEstimate, 2, 10)
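    # `mat` is built earlier (not shown in this excerpt); presumably a RowMatrix
    # whose columns are restaurants. A minimal sketch of that setup, kept
    # commented out like the other illustrative blocks in this file:
    '''
    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.linalg.distributed import RowMatrix

    rows = sc.parallelize([Vectors.dense([5.0, 3.0, 0.0]),
                           Vectors.dense([4.0, 0.0, 1.0])])
    mat = RowMatrix(rows)
    # columnSimilarities() returns a CoordinateMatrix of cosine similarities
    print(mat.columnSimilarities().entries.take(5))
    '''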


    '''
    restaurantSample = restTupleListRDD.take(20)

    for i in range(20):
        print(restaurantSample[i])
    '''
    # Collect all possible values of every attribute using an accumulator;
    # unfortunately this approach does not work:
    '''
    attributeAccum = sc.accumulator(restAttribSetBC, attributeAccumulatorParam())
    restaurantRDD.map(lambda entry: (entry, restAttribSetBC)).foreach(lambda x: attributeAccum.add(x))

    print(type(attributeAccum))
    '''


    reviewRDD = sc.textFile("../../../data/yelp_academic_dataset_review.json")
    reviewRestaRDD = reviewRDD.map(Review.toString).filter(lambda line: Review.is_res(line, restaurantListBC))
    userReviewRestaRDD = reviewRestaRDD.map(Review.mapper).reduceByKey(Review.reducer)

    userRDD = sc.textFile("../../../data/yelp_academic_dataset_user.json")
    userRDD = userRDD.map(User.toString).map(User.getFriends)  # each user's friend list
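
As with Business and Review, the User helper is not included in this listing; a minimal sketch consistent with its use above (the field names come from the Yelp user JSON, and the return shape is an assumption):

import json

class User(object):
    # Sketch only: the real helper class is not shown in these examples

    @staticmethod
    def toString(line):
        # each input line is one JSON user record
        return json.loads(line)

    @staticmethod
    def getFriends(record):
        # (user_id, list of friend ids); the exact return shape is an assumption
        return (record["user_id"], record.get("friends", []))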