def CF_evaluating(test_user_data,user_rating_table,restaurant_features):
    """
    calculate evaluations using collaborative filtering
    test_user_data -- {user : {restaurant : [reviews]}}
    user_rating_table --{user : {restaurant : rating}}
    restaurant_features -- {restaurant: list}, where list is a word vector, mean-centered
    return evaluations -- {user : {restaurant : (true_rating, prediction)}}
    """
    
    print len(test_user_data), " users to predict"
    
    cache = {}
    evaluations = dict()
    count = 0
    for user,item_review_table_test in test_user_data.iteritems():
        print count
        print "this user has ",len(item_review_table_test)," ratings to predict"
        # user -- John, item_review_table_test -- dict{restaurant:[reviews]} the test set for user John
        count = count + 1
        evaluations[user] = dict()
        count1 = 0
        for restaurant,reviews in item_review_table_test.iteritems():
            print "doing prediction for test sample ", count1
            count1 = count1 + 1
            print "calculating true rating"
            true_rating = utils.cal_average_rating(reviews)
            print "calculating prediction"
            prediction = CF_prediction(user_rating_table[user],restaurant_features,restaurant,user,cache)
            evaluations[user][restaurant] = (true_rating,prediction)
        # if count > 100:
        #     break
    return evaluations
def extracttfidf_user(user_indexed_reviews, all_reviews, restaurant_indexed_reviews):
    """
    extract tf-idf feature for every user
    user_indexed_reviews {user_id : {business_id : [review]}}
    return word_count -- sparse array(word vector), ratings -- np array of label
    """
    user_all_reviews = []
    # count vector num in user_count
    user_count = dict()
    X_total = dict()
    y_total = dict()
    restaurant_feature = dict()
    ratings = []
    for user in user_indexed_reviews:
        user_count[user] = 0
        restaurant_reviews = user_indexed_reviews[user]
        for restaurant in restaurant_reviews:
            # extract feature
            reviews_content = ''
            reviews = restaurant_reviews[restaurant]
            for review in reviews:
                reviews_content += review['text'][0:len(review['text'])-1]
            if reviews_content == '':
                continue
            user_all_reviews.append(reviews_content)
            # compute label
            rating = round(utils.cal_average_rating(reviews)*2)
            ratings.append(rating)
            # count words
            user_count[user] += 1
    user_all_reviews += all_reviews
    vectorizer = TfidfVectorizer(min_df=1)
    word_count = vectorizer.fit_transform(user_all_reviews)

    sum_count = 0
    for user in user_indexed_reviews:
        if user_count[user] == 0:
            X_total[user] = None
            y_total[user] = None
        else:
            X_total[user] = word_count[sum_count:sum_count+user_count[user]+1, :]
            y_total[user] = np.array(ratings[sum_count:sum_count+user_count[user]+1])
        sum_count += user_count[user]

    i = sum_count
    for restaurant in restaurant_indexed_reviews:
        restaurant_feature[restaurant] = word_count[i, :]
        i = i + 1
    print i, sum_count
    return X_total,y_total,restaurant_feature
	def svd_test(self, test_user_data):
		"""
		Score the svd model on held-out user data.

		test_user_data -- {user : {restaurant : [reviews]}}
		return evaluations -- {user : {restaurant : (true_rating, prediction)}}

		NOTE(review): the model parameters are indexed by enumeration
		position -- the user's position within test_user_data for self.bu /
		self.p, and the restaurant's position within that user's own dict for
		self.br / self.q. This is only meaningful if those arrays are laid
		out in exactly that order -- confirm against the training code.
		"""
		results = {}
		for user_pos, (user, restaurant_reviews) in enumerate(test_user_data.items()):
			user_results = results[user] = {}
			for restaurant_pos, (restaurant, reviews) in enumerate(restaurant_reviews.items()):
				true_rating = utils.cal_average_rating(reviews)
				prediction = self.predictScore(
					self.average,
					self.bu[user_pos], self.br[restaurant_pos],
					self.p[user_pos], self.q[restaurant_pos],
				)
				user_results[restaurant] = (true_rating, prediction)
		return results