def split_data_by_business(train_ratio_of_total=0.5):
    """Split reviews into train/test sets, keeping each business's reviews together.

    Every business is assigned as a whole to one side of the split, so all
    reviews of a particular business end up in either the training set or
    the test set.  This prevents links between reviews of the same business
    from being lost during the split.

    Reads ``./reviews.json`` and writes ``./train_reviews.json`` and
    ``./test_reviews.json``.

    Args:
        train_ratio_of_total: Threshold compared against one
            ``random.random()`` draw per business; a business whose draw is
            at or below it goes to the training set.  Because whole
            businesses are assigned, the share of *reviews* that ends up in
            the training set only approximates this value.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")

    # Values are the ids of each business's reviews (used as keys into
    # ``reviews`` below).
    businesses = business_reviews_dict(reviews)

    train = []
    test = []

    for review_ids in businesses.values():
        # One draw per business, not per review, so a business is never
        # divided between the two sets.
        target = train if random.random() <= train_ratio_of_total else test
        target.extend(reviews[review_id] for review_id in review_ids)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
Exemple #2
0
def split_data_by_business(train_ratio_of_total=0.5):
    """Split reviews into train/test sets, keeping each business's reviews together.

    Every business is assigned as a whole to one side of the split, so all
    reviews of a particular business end up in either the training set or
    the test set.  This prevents links between reviews of the same business
    from being lost during the split.

    Reads ``./reviews.json`` and writes ``./train_reviews.json`` and
    ``./test_reviews.json``.

    Args:
        train_ratio_of_total: Threshold compared against one
            ``random.random()`` draw per business; a business whose draw is
            at or below it goes to the training set.  Because whole
            businesses are assigned, the share of *reviews* that ends up in
            the training set only approximates this value.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")

    # Values are the ids of each business's reviews (used as keys into
    # ``reviews`` below).
    businesses = business_reviews_dict(reviews)

    train = []
    test = []

    for review_ids in businesses.values():
        # One draw per business, not per review, so a business is never
        # divided between the two sets.
        target = train if random.random() <= train_ratio_of_total else test
        target.extend(reviews[review_id] for review_id in review_ids)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
def main():
    """Split the review data in reviews.json into training and test sets.

    Loads ``./users.json`` and ``./reviews.json``, cleans the reviews with
    ``cleanyelp.clean_review_dict`` and then assigns every remaining review:

    * a review with at least one entry in ``"friend_reviews_of_business"``
      is placed in the test set or the training set at random, with an even
      chance of each;
    * a review with no such entries always goes to the training set.

    The two sets are written to ``./train_reviews.json`` and
    ``./test_reviews.json``.

    An earlier variant split the reviews around a date obtained from
    ``cleanyelp.median_date`` instead; that split is no longer performed, so
    the date is not computed here.
    """
    users = readyelp.read_users_to_dict("./users.json")
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    cleanyelp.clean_review_dict(reviews, users)

    train = []
    test = []

    for review in reviews.values():
        has_friend_reviews = len(review["friend_reviews_of_business"]) > 0
        # The random draw only happens for reviews that have friend reviews
        # (short-circuit), so the random stream is consumed only for them.
        if has_friend_reviews and random.random() <= 0.5:
            test.append(review)
        else:
            train.append(review)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
Exemple #4
0
def filter_users():
    """Remove users that have no reviews in the training or test data sets.

    Reads ``./users.json``, ``./train_reviews.json`` and
    ``./test_reviews.json``.  Each user's ``"reviews"`` list is reduced to
    the review ids found in either data set; users left without any review
    are dropped.  The remaining users are written back to ``./users.json``,
    overwriting the file.
    """
    user_dict = readyelp.read_users_to_dict("./users.json")
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")

    users_limited = []

    for user in user_dict.values():
        # Build a new list instead of calling remove() on the list being
        # iterated: removing during iteration makes the loop skip the
        # element that follows each removal, which left stale ids behind.
        kept_review_ids = [
            review_id
            for review_id in user["reviews"]
            if review_id in train_reviews or review_id in test_reviews
        ]
        if kept_review_ids:
            user["reviews"] = kept_review_ids
            users_limited.append(user)

    readyelp.write_output(users_limited, "./users.json")
def filter_users():
    """Remove users that have no reviews in the training or test data sets.

    Reads ``./users.json``, ``./train_reviews.json`` and
    ``./test_reviews.json``.  Each user's ``"reviews"`` list is reduced to
    the review ids found in either data set; users left without any review
    are dropped.  The remaining users are written back to ``./users.json``,
    overwriting the file.
    """
    user_dict = readyelp.read_users_to_dict("./users.json")
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")

    users_limited = []

    for user in user_dict.values():
        # Build a new list instead of calling remove() on the list being
        # iterated: removing during iteration makes the loop skip the
        # element that follows each removal, which left stale ids behind.
        kept_review_ids = [
            review_id
            for review_id in user["reviews"]
            if review_id in train_reviews or review_id in test_reviews
        ]
        if kept_review_ids:
            user["reviews"] = kept_review_ids
            users_limited.append(user)

    readyelp.write_output(users_limited, "./users.json")
Exemple #6
0
def split_data(train_ratio_of_total=0.5):
    """Randomly split the cleaned reviews into training and test sets.

    Loads ``./reviews.json`` and ``./users.json``, runs ``clean_review_dict``
    on them, and then makes one ``random.random()`` draw per remaining
    review: a draw at or below ``train_ratio_of_total`` sends the review to
    the training set, anything else to the test set.  The default of 0.5
    splits the data evenly between the two sets.

    The sets are written to ``./train_reviews.json`` and
    ``./test_reviews.json``; ``filter_users()`` is called last, after both
    files have been written.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    users = readyelp.read_users_to_dict("./users.json")
    clean_review_dict(reviews, users)

    train, test = [], []
    for review in reviews.values():
        bucket = train if random.random() <= train_ratio_of_total else test
        bucket.append(review)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
    filter_users()
def split_data(train_ratio_of_total = 0.5):
    """Randomly split the cleaned reviews into training and test sets.

    Loads ``./reviews.json`` and ``./users.json``, runs ``clean_review_dict``
    on them, and then makes one ``random.random()`` draw per remaining
    review: a draw at or below ``train_ratio_of_total`` sends the review to
    the training set, anything else to the test set.  The default of 0.5
    splits the data evenly between the two sets.

    The sets are written to ``./train_reviews.json`` and
    ``./test_reviews.json``; ``filter_users()`` is called last, after both
    files have been written.
    """
    reviews = readyelp.read_reviews_to_dict("./reviews.json")
    users = readyelp.read_users_to_dict("./users.json")
    clean_review_dict(reviews, users)

    train, test = [], []
    for review in reviews.values():
        bucket = train if random.random() <= train_ratio_of_total else test
        bucket.append(review)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
    filter_users()
def clean_review_dict(review_dict, user_dict):
    """Clean ``review_dict`` in place and write the kept reviews to ./reviews.json.

    For every review, ``"rating"`` is replaced by the result of
    ``_convert_star_rating_to_binary_klass``.  A review is then removed from
    ``review_dict`` when its ``"user_id"`` is not in ``user_dict`` or when
    ``find_influencers`` returns an empty result for it; otherwise that
    result is stored on the review under ``"friend_reviews_of_business"``.
    The reviews that were kept are written to ``./reviews.json``.
    """
    dropped_ids = []
    kept_reviews = []
    for review_id, review in review_dict.items():
        review["rating"] = _convert_star_rating_to_binary_klass(review["rating"])
        # NOTE(review): the converted date is not stored back on the review;
        # confirm whether review["date"] was meant to be updated here.
        _convert_review_date(review["date"])

        if review["user_id"] not in user_dict:
            dropped_ids.append(review_id)
            continue

        influencers = find_influencers(review, review_dict, user_dict)
        if len(influencers) == 0:
            dropped_ids.append(review_id)
            continue

        review["friend_reviews_of_business"] = influencers
        review_dict[review_id] = review
        kept_reviews.append(review)

    # Deletions are deferred until the iteration over review_dict is done.
    for review_id in dropped_ids:
        del review_dict[review_id]
    readyelp.write_output(kept_reviews, "./reviews.json")
Exemple #9
0
def clean_review_dict(review_dict, user_dict):
    """Clean ``review_dict`` in place and write the kept reviews to ./reviews.json.

    For every review, ``"rating"`` is replaced by the result of
    ``_convert_star_rating_to_binary_klass``.  A review is then removed from
    ``review_dict`` when its ``"user_id"`` is not in ``user_dict`` or when
    ``find_influencers`` returns an empty result for it; otherwise that
    result is stored on the review under ``"friend_reviews_of_business"``.
    The reviews that were kept are written to ``./reviews.json``.
    """
    dropped_ids = []
    kept_reviews = []
    for review_id, review in review_dict.items():
        review["rating"] = _convert_star_rating_to_binary_klass(review["rating"])
        # NOTE(review): the converted date is not stored back on the review;
        # confirm whether review["date"] was meant to be updated here.
        _convert_review_date(review["date"])

        if review["user_id"] not in user_dict:
            dropped_ids.append(review_id)
            continue

        influencers = find_influencers(review, review_dict, user_dict)
        if len(influencers) == 0:
            dropped_ids.append(review_id)
            continue

        review["friend_reviews_of_business"] = influencers
        review_dict[review_id] = review
        kept_reviews.append(review)

    # Deletions are deferred until the iteration over review_dict is done.
    for review_id in dropped_ids:
        del review_dict[review_id]
    readyelp.write_output(kept_reviews, "./reviews.json")
#! /usr/bin/env python

""" Takes two arguments: the pathname for the Yelp Academic Dataset reviews json file and the users json file from the same dataset.  The dataset is available at https://www.yelp.com/dataset_challenge/dataset """

import readyelp
import cleanyelp
import splitdata
import baselineclassifier
import sys

# Both dataset paths are required; exit with a usage message rather than an
# IndexError traceback when one of them is missing.
if len(sys.argv) < 3:
    sys.exit("usage: %s REVIEWS_JSON USERS_JSON" % sys.argv[0])

review_path = sys.argv[1]
user_path = sys.argv[2]

# Containers handed to the readyelp parsers; presumably populated in place,
# since they are written out right after parsing.
reviews = []
users = []
reviews_by_user = {}
# The review file is parsed first; the same reviews_by_user mapping is then
# passed on to the user parser.
readyelp.parse_review_dataset_file(reviews, reviews_by_user, review_path)
readyelp.parse_user_dataset_file(users, reviews_by_user, user_path)
readyelp.write_output(reviews, "reviews.json")
readyelp.write_output(users, "users.json")

# Split the freshly written reviews.json into train/test files, then run the
# baseline classifier.
splitdata.main()

baselineclassifier.main()