Esempio n. 1
0
def predicted_business_rating():
    id_rating_dict = {}
    mean = fuc.mean_user_stars()
    category_ratings = average_category_rating()
    for entry in training_business_data:
        actual_rating = entry['stars']
        sum_expected_rating = sum([category_ratings[category] for category in categories(entry)])
        num_categories = len(categories(entry))
        review_count = entry['review_count']
        try:
            expected_rating = sum_expected_rating / num_categories
        except ZeroDivisionError:
            #gives more weight to their average rating if more reviews
            expected_rating = (mean + review_count*actual_rating)/(1 + review_count)
        predicted_rating = (actual_rating*review_count + expected_rating)/(review_count + 1)
        id_rating_dict[entry['business_id']] = predicted_rating
    for entry in test_business_data:
        try:
            predicted_rating = id_rating_dict[entry['business_id']]
        except KeyError:
            sum_expected_rating = sum([category_ratings[category] for category in categories(entry)
                                       if category in category_set(training_business_data)])
            num_categories = len(categories(entry))
            try:
                expected_rating = sum_expected_rating / num_categories
            except ZeroDivisionError:
                expected_rating = mean
            predicted_rating = expected_rating
            id_rating_dict[entry['business_id']] = predicted_rating
    return id_rating_dict
Esempio n. 2
0
def main():
    #user_ids is test set_review_ids
    gender_ratings = gender.id_gender()
    test_gender_ratings = gender.test_id_gender()
    female_mean = gender.training_mean('female')
    male_mean = gender.training_mean('male')
    unknown_mean = gender.training_mean('unknown')
    both_mean = gender.training_mean('both')
    mean_stars = fuc.mean_user_stars()

    #for users in training user set
    id_stars_dict = id_stars()
    id_reviews = id_review_dict()

    #user_id stuff
    test_users = ua.review_test_users()
    training_users = ua.review_training_users()
    all_groups = ua.all_group_users()

    #these variables will be for users who have writing samples available
    parse_review_users = ua.review_training_test_users()
    parse_avg = rp.id_grade_avg()
    review_user_stars = rp.id_stars()
    review_stars_average = rp.id_stars_avg()

    #business stuff
    test_businesses = banal.review_test_businesses()
    training_businesses = banal.review_training_businesses()
    training_review_businesses = list(banal.id_stars().keys())
    test_categories = banal.test_id_categories()
    training_categories = banal.id_categories()
    grade_category_dict = banal.grade_categories_avg()
    global_grade_mean = sum(grade_category_dict.values())/len(grade_category_dict.values())
    expected_business_rating = banal.predicted_business_rating()
    business_grade_avg = banal.id_grade_avg()
    partition_dict = banal.partition_mean_std()
    partitions = sorted([float(key) for key in partition_dict.keys()])

    #for training_review_businesses only!
    business_star_avg = banal.id_star_avg()
    business_stars_list = banal.id_stars()

    #funny_useful_cool stuff, for training users only!
    fuc_rating_dict = fuc.predicted_rating()
    total_fuc_ratings = fuc.total_fuc()

    #where final ratings go for users
    user_ratings = {}
    #where user average grade levels go
    user_grades = {}
    #where final ratings go for businesses
    business_ratings = {}
    business_grades = {}

    #in case all of these loops don't contain certain users, initialize all users to the mean
    for user in user_ids:
        user_ratings[user] = mean_stars

    for user in training_users:
        if gender_ratings[user] == 'female':
            user_gender_rating = female_mean
        elif gender_ratings[user] == 'male':
            user_gender_rating = male_mean
        elif gender_ratings[user] == 'unknown':
            user_gender_rating = unknown_mean
        else:
            user_gender_rating = both_mean
        user_stars = id_stars_dict[user]
        review_count = id_reviews[user]
        #fuc rating given on 1-5 scale based on lms regression on funny, useful, cool ratings and star ratings
        fuc_rating = fuc_rating_dict[user]
        fuc_count = total_fuc_ratings[user] + 1
        rating = ((np.log(review_count)*user_stars + user_gender_rating
                  + fuc_rating*np.log(fuc_count))/(np.log(review_count) + np.log(fuc_count) + 1))
        user_ratings[user] = rating

    #in the future, must make test user predictions more accurate
    # by adding review count correlation with post intelligence
    for user in test_users:
        if test_gender_ratings[user] == 'female':
            user_gender_rating = female_mean
        elif test_gender_ratings[user] == 'male':
            user_gender_rating = male_mean
        elif test_gender_ratings[user] == 'unknown':
            user_gender_rating = unknown_mean
        else:
            user_gender_rating = both_mean
        rating = user_gender_rating
        user_ratings[user] = rating

    for user in parse_review_users:
        user_stars = review_stars_average[user]
        review_count = len(review_user_stars[user])
        rating = (user_stars*np.log(review_count) + mean_stars)/(np.log(review_count) + 1)
        user_ratings[user] = rating
        #create entry in grade dictionary for user writing grade
        user_grades[user] = parse_avg[user]

    for user in set(test_users).intersection(training_users):
        if gender_ratings[user] == 'female':
            user_gender_rating = female_mean
        elif gender_ratings[user] == 'male':
            user_gender_rating = male_mean
        elif gender_ratings[user] == 'unknown':
            user_gender_rating = unknown_mean
        else:
            user_gender_rating = both_mean
        user_stars = id_stars_dict[user]
        review_count = id_reviews[user]
        fuc_rating = fuc_rating_dict[user]
        fuc_count = total_fuc_ratings[user] + 1
        rating = ((np.log(review_count)*user_stars + user_gender_rating
                  + fuc_rating*np.log(fuc_count))/(np.log(review_count) + np.log(fuc_count) + 1))
        rating = ((np.log(review_count)*user_stars + user_gender_rating
                 )/(np.log(review_count) + 1))
        user_ratings[user] = rating

    for user in set(parse_review_users).intersection(training_users):
        if gender_ratings[user] == 'female':
            user_gender_rating = female_mean
        elif gender_ratings[user] == 'male':
            user_gender_rating = male_mean
        elif gender_ratings[user] == 'unknown':
            user_gender_rating = unknown_mean
        else:
            user_gender_rating = both_mean
        user_stars = id_stars_dict[user]
        user_stars_review = review_stars_average[user]
        review_count = id_reviews[user]
        review_count_reviews = len(review_user_stars[user])
        fuc_rating = fuc_rating_dict[user]
        fuc_count = total_fuc_ratings[user] + 1
        rating = ((user_stars*np.log(review_count)
                  + user_stars_review*np.log(review_count_reviews) + user_gender_rating
                  + fuc_rating*np.log(fuc_count))/(np.log(review_count_reviews)
                                                   + np.log(review_count)
                                                   + np.log(fuc_count) + 1))
        rating = ((user_stars*np.log(review_count)
                  + user_stars_review*np.log(review_count_reviews) + user_gender_rating
                  )/(np.log(review_count_reviews)
                                                   + np.log(review_count)
                                                    + 1))
        user_ratings[user] = rating
        user_grades[user] = parse_avg[user]

    for user in set(parse_review_users).intersection(test_users):
        if test_gender_ratings[user] == 'female':
            user_gender_rating = female_mean
        elif test_gender_ratings[user] == 'male':
            user_gender_rating = male_mean
        elif test_gender_ratings[user] == 'unknown':
            user_gender_rating = unknown_mean
        else:
            user_gender_rating = both_mean
        try:
            user_review_rating = review_stars_average[user]
        except KeyError:
            user_review_rating = mean_stars
        try:
            review_count_reviews = len(review_user_stars[user])
        except KeyError:
            review_count_reviews = 1.0
            print('error')
        user_stars = review_stars_average[user]
        review_count = len(review_user_stars[user])
        rating = (user_stars*np.log(review_count) + user_gender_rating
                  + user_review_rating*np.log(review_count_reviews))/(np.log(review_count)
                                                                      + np.log(review_count_reviews) + 1)
        user_ratings[user] = rating
        user_grades[user] = parse_avg[user]

    for user in all_groups:
        if test_gender_ratings[user] == 'female':
            user_gender_rating = female_mean
        elif test_gender_ratings[user] == 'male':
            user_gender_rating = male_mean
        elif test_gender_ratings[user] == 'unknown':
            user_gender_rating = unknown_mean
        else:
            user_gender_rating = both_mean
        try:
            user_stars = id_stars_dict[user]
        except KeyError:
            user_stars = mean_stars
        user_stars_review = review_stars_average[user]
        review_count = id_reviews[user]
        review_count_reviews = len(review_user_stars[user])
        fuc_rating = fuc_rating_dict[user]
        fuc_count = total_fuc_ratings[user] + 1
        rating = ((user_stars*np.log(review_count)
                  + user_stars_review*np.log(review_count_reviews) + user_gender_rating
                  + fuc_rating*np.log(fuc_count))/(2*np.log(review_count_reviews)
                                                   + np.log(review_count)
                                                   + np.log(fuc_count) + 1))
        rating = ((user_stars*np.log(review_count)
                  + user_stars_review*np.log(review_count_reviews) + user_gender_rating
                  )/(2*np.log(review_count_reviews)
                                                   + np.log(review_count)
                                                    + 1))
        user_ratings[user] = rating
        user_grades[user] = parse_avg[user]

    #business stuff, fill this
    for business in training_review_businesses:
        rating_count = len(business_stars_list[business])
        business_avg = business_star_avg[business]
        rating = (mean_stars + business_avg*np.log(rating_count))/(1 + np.log(rating_count))
        business_ratings[business] = rating
        business_grades[business] = business_grade_avg[business]

    for business in training_businesses:
        categories = training_categories[business]
        category_grade_list = []
        for category in categories:
            try:
                category_grade_list.append(grade_category_dict[category])
            except KeyError:
                continue
        try:
            category_grade_rating = sum(category_grade_list)/len(category_grade_list)
        except ZeroDivisionError:
            category_grade_rating = global_grade_mean
        rating = expected_business_rating[business]
        business_ratings[business] = rating
        business_grades[business] = category_grade_rating

    for business in test_businesses:
        categories = test_categories[business]
        category_grade_list = []
        for category in categories:
            try:
                category_grade_list.append(grade_category_dict[category])
            except KeyError:
                continue
        try:
            category_grade_rating = sum(category_grade_list)/len(category_grade_list)
        except ZeroDivisionError:
            category_grade_rating = global_grade_mean
        rating = expected_business_rating[business]
        business_ratings[business] = rating
        business_grades[business] = category_grade_rating

    for business in set(test_businesses).intersection(training_businesses):
        categories = training_categories[business]
        category_grade_list = []
        for category in categories:
            try:
                category_grade_list.append(grade_category_dict[category])
            except KeyError:
                continue
        try:
            category_grade_rating = sum(category_grade_list)/len(category_grade_list)
        except ZeroDivisionError:
            category_grade_rating = global_grade_mean
        rating = expected_business_rating[business]
        business_ratings[business] = rating
        business_grades[business] = category_grade_rating

    for business in set(test_businesses).intersection(training_review_businesses):
        rating_count = len(business_stars_list[business])
        business_avg = business_star_avg[business]
        rating = (expected_business_rating[test_businesses]
                  + business_avg*np.log(rating_count))/(1 + np.log(rating_count))
        business_ratings[business] = rating
        business_grades[business] = business_grade_avg[business]

    for business in set(training_businesses).intersection(training_review_businesses):
        rating_count = len(business_stars_list[business])
        business_avg = business_star_avg[business]
        rating = (expected_business_rating[business]
                  + business_avg*np.log(rating_count))/(1 + np.log(rating_count))
        business_ratings[business] = rating
        business_grades[business] = business_grade_avg[business]

    with open('yelp_test_set_review.json') as file:
        final_data = [json.loads(entry) for entry in file]
        ratings = []
        for i in range(len(final_data)):
            try:
                #calculate writing level difference
                diff = user_grades[final_data[i]['user_id']] - business_grades[final_data[i]['business_id']]
                #and then calculate expected contribution to star rating prediction for user business pair
                partition = -3.83205
                for j in range(len(partitions)):
                    if diff < partitions[j]:
                        partition = partitions[j]
                    else:
                        partition = partitions[-1]
                diff_score = partition_dict[str(partition)][0]
            except KeyError:
                diff_score = 0
            rating = (user_ratings[final_data[i]['user_id']]
                                  + business_ratings[final_data[i]['business_id']])/2
            if rating > 5:
                rating = 5.0
            elif rating < 1:
                rating = 1.0
            ratings.append({'review_id': final_data[i]['review_id'], 'stars': rating})
    keys = ['review_id', 'stars']
    f = open('newfuc.csv', 'w')
    dict_writer = csv.DictWriter(f, keys)
    dict_writer.writer.writerow(keys)
    dict_writer.writerows(ratings)