def CF_evaluating(test_user_data, user_rating_table, restaurant_features):
    """
    Evaluate collaborative-filtering predictions on held-out reviews.

    For every (user, restaurant) pair in the test data, the true rating is
    utils.cal_average_rating over the held-out reviews and the prediction
    comes from CF_prediction. Progress messages are printed to stdout.

    test_user_data -- {user : {restaurant : [reviews]}}
    user_rating_table -- {user : {restaurant : rating}}; looked up with
        user_rating_table[user] for each test sample, so a test user that
        is missing from the table raises KeyError
    restaurant_features -- {restaurant: list}, where list is a word vector,
        mean-centered
    return evaluations -- {user : {restaurant : (true_rating, prediction)}}
    """
    # Single-argument print(...) with %-formatting runs on both Python 2 and
    # Python 3 and emits exactly the text the former print statements did
    # (including their doubled spaces).
    print("%d  users to predict" % len(test_user_data))

    # One dict handed to every CF_prediction call; presumably a memo of
    # intermediate results -- confirm against CF_prediction.
    cache = {}
    evaluations = {}
    for user_number, (user, held_out) in enumerate(test_user_data.items()):
        # held_out is this user's test set: {restaurant: [reviews]}
        print(user_number)
        print("this user has  %d  ratings to predict" % len(held_out))
        evaluations[user] = {}
        for sample_number, (restaurant, reviews) in enumerate(held_out.items()):
            print("doing prediction for test sample  %d" % sample_number)
            print("calculating true rating")
            true_rating = utils.cal_average_rating(reviews)
            print("calculating prediction")
            prediction = CF_prediction(
                user_rating_table[user], restaurant_features, restaurant, user, cache
            )
            evaluations[user][restaurant] = (true_rating, prediction)
    return evaluations
def extracttfidf_user(user_indexed_reviews, all_reviews, restaurant_indexed_reviews):
    """
    Extract tf-idf features for every user and every restaurant.

    One document is built per (user, restaurant) pair by concatenating the
    pair's review texts, each with its final character dropped. Pairs whose
    document is empty are skipped. These documents, followed by all_reviews,
    are vectorized by a single TfidfVectorizer so that user rows and
    restaurant rows share one vocabulary.

    user_indexed_reviews -- {user_id : {business_id : [review]}}, where each
        review is indexable by 'text'
    all_reviews -- documents appended after the user documents; the k-th of
        them is assigned to the k-th key yielded by iterating
        restaurant_indexed_reviews (assumes both are in the same order and of
        the same length -- TODO confirm against the caller)
    restaurant_indexed_reviews -- only its keys are used here
    return X_total -- {user : tf-idf rows of that user's documents, or None
                       if the user has none},
           y_total -- {user : np array of labels aligned with X_total[user],
                       or None},
           restaurant_feature -- {restaurant : tf-idf row}
    """
    documents = []
    ratings = []
    # Number of non-empty documents contributed by each user.
    user_count = {}
    for user, restaurant_reviews in user_indexed_reviews.items():
        user_count[user] = 0
        for reviews in restaurant_reviews.values():
            # Feature text: every review with its last character removed,
            # joined without a separator (same text as before).
            reviews_content = ''.join(review['text'][:-1] for review in reviews)
            if not reviews_content:
                continue
            documents.append(reviews_content)
            # Label: average rating doubled, then rounded.
            ratings.append(round(utils.cal_average_rating(reviews) * 2))
            user_count[user] += 1

    documents.extend(all_reviews)
    word_count = TfidfVectorizer(min_df=1).fit_transform(documents)

    X_total = {}
    y_total = {}
    start = 0
    for user in user_indexed_reviews:
        n_docs = user_count[user]
        if n_docs == 0:
            X_total[user] = None
            y_total[user] = None
            continue
        # Half-open slice of exactly n_docs rows. The previous "+ 1" upper
        # bound pulled in the first row of the next user (or of the
        # restaurant section) and left X and y with different lengths for
        # the last user.
        stop = start + n_docs
        X_total[user] = word_count[start:stop, :]
        y_total[user] = np.array(ratings[start:stop])
        start = stop

    # Rows after the user documents belong to all_reviews.
    restaurant_feature = {}
    row = start
    for restaurant in restaurant_indexed_reviews:
        restaurant_feature[restaurant] = word_count[row, :]
        row += 1
    # Debug output: final row index and number of user documents.
    print("%s %s" % (row, start))
    return X_total, y_total, restaurant_feature
def svd_test(self, test_user_data):
    """
    Score the SVD model on held-out user data.

    The true rating of each (user, restaurant) pair is
    utils.cal_average_rating over its reviews; the prediction is
    self.predictScore applied to self.average and the entries of self.bu,
    self.br, self.p and self.q selected by enumeration position.

    test_user_data -- {user : {restaurant : [reviews]}}
    return evaluations -- {user : {restaurant : (true_rating, prediction)}}
    """
    # NOTE(review): u_idx is the user's position while iterating
    # test_user_data and r_idx is the restaurant's position within that
    # user's own dict. This assumes those positions line up with the
    # indexing of self.bu/self.p and self.br/self.q -- confirm against the
    # training code.
    results = {}
    for u_idx, user in enumerate(test_user_data):
        per_user = results[user] = {}
        for r_idx, (restaurant, reviews) in enumerate(test_user_data[user].items()):
            actual = utils.cal_average_rating(reviews)
            predicted = self.predictScore(
                self.average,
                self.bu[u_idx],
                self.br[r_idx],
                self.p[u_idx],
                self.q[r_idx],
            )
            per_user[restaurant] = (actual, predicted)
    return results